diff --git a/CMakeLists.txt b/CMakeLists.txt
index 265ddc9504167f21f54a1b1e7777147b3b6d37d9..fb796103350ac4403d4151cf08eb4315bcde68fd 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,10 @@ include(generic)            # simplify cmake module
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
-
+option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN"        OFF)
+if (WITH_GPU  AND WITH_XPU)
+    message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
 if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
     message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
diff --git a/Dockerfile b/Dockerfile
index 42a103240e882b2732f14619308cc00f010d20af..b92ac228a8d50da93f8c0bfe2f6af31fc784f2c7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 
@@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
 
-# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
-# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
-# version(1.7.1 for now), which causes building documentation failed.
+
 RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
-    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
-    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
@@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
 RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
 RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 
-RUN pip3 --no-cache-dir install coverage                
-RUN pip3.6 --no-cache-dir install coverage             
-RUN pip3.7 --no-cache-dir install coverage            
+RUN pip3 --no-cache-dir install coverage
+RUN pip3.6 --no-cache-dir install coverage
+RUN pip3.7 --no-cache-dir install coverage
 RUN pip --no-cache-dir install coverage
 
 COPY ./python/requirements.txt /root/
@@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure]
 
 
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
 
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/README.md b/README.md
index 4196811e37f73f84b0327f5cbf1996aaaf7e6dcc..d14d0ef00148140bd931bbc692fbe15bb21a7bf3 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 
 ```
 It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
diff --git a/README_cn.md b/README_cn.md
index 93ad06d20010fcba1ff3382b169cb78328f2a375..e4544a3eff6e55a29fcfa806786e55e0ac41a672 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -30,7 +30,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 
 ```
 更多安装信息详见官网 [安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index bb57b42dcc74114312a400a0f6cc95df307de6bb..cf458d97706755e794c5fbb1ba9d3fcb51e9d1ce 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
     add_definitions(-DPADDLE_WITH_BOX_PS)
 endif()
 
+if(WITH_XPU)
+    message(STATUS "Compile with XPU!")
+    add_definitions(-DPADDLE_WITH_XPU)
+endif()
+
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 1688d9d98b7a079b32322d03c46cd6d89b717881..b7a93cd9ee2160090c0142d62d96da72e4c58717 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable)
   if(NOT CUDA_gpu_detect_output)
     message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
     set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+    #Todo: fix Automatic GPU detection failed on windows
+    if(WIN32)
+      set(${out_variable} "61 75" PARENT_SCOPE)
+    endif()
   else()
     set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
   endif()
@@ -202,6 +206,11 @@ if (NOT WIN32) # windows msvc2015 support c++11 natively.
   set(CMAKE_CUDA_STANDARD 11)
 endif(NOT WIN32)
 
+# (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
+# So replace /W[1-4] with /W0
+if (WIN32)
+  string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+endif(WIN32)
 # in cuda9, suppress cuda warning on eigen
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index 4a343b2c6af2ce64d65203ae5955b2d552055198..6f790f1af8e1a03d1101244a4d82045331b44c13 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -17,7 +17,7 @@ include(ExternalProject)
 set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
 set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
 set(CUB_REPOSITORY https://github.com/NVlabs/cub.git)
-set(CUB_TAG        1.9.8)
+set(CUB_TAG        1.8.0)
 
 cache_third_party(extern_cub
     REPOSITORY    ${CUB_REPOSITORY}
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index 337e326dc166fd844a938ecd936d8c4162a45573..895bc0849a2a3b57e9e7ba2576567032f07fb35b 100644
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -14,13 +14,21 @@
 
 INCLUDE(ExternalProject)
 
+execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
+
 SET(GLOO_PROJECT       "extern_gloo")
 IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(GLOO_VER "master" CACHE STRING "" FORCE)
   SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-  SET(GLOO_URL "https://pslib.bj.bcebos.com/gloo.tar.gz" CACHE STRING "" FORCE)
+
+  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
+  else()
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
+  endif()
 ENDIF()
+
 MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
 SET(GLOO_SOURCE_DIR    "${THIRD_PARTY_PATH}/gloo")
 SET(GLOO_DOWNLOAD_DIR  "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9f3606138defa04f979d8bea348e7bfda181af68..ae870b766fc3349ea53628e14c68ab9a5826213f 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR     ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY     https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG            fb95345126ade4c54f5507e580a5f5da8d30a515)
+SET(MKLDNN_TAG            1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8a927d8e282a03e8a74c0814ee8d9b247451a091
--- /dev/null
+++ b/cmake/external/xpu.cmake
@@ -0,0 +1,54 @@
+if (NOT WITH_XPU)
+    return()
+endif()
+
+INCLUDE(ExternalProject)
+SET(XPU_PROJECT                 "extern_xpu")
+SET(XPU_URL    "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
+SET(XPU_DOWNLOAD_DIR            "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
+SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")
+SET(XPU_API_INC_DIR             "${THIRD_PARTY_PATH}/install/xpu/api/include")
+SET(XPU_RUNTIME_INC_DIR         "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
+SET(XPU_LIB_DIR                 "${THIRD_PARTY_PATH}/install/xpu/lib")
+
+SET(XPU_API_LIB_NAME            "libxpuapi.so")
+SET(XPU_RT_LIB_NAME             "libxpurt.so")
+SET(XPU_SIM_LIB_NAME            "libxpusim.so")
+SET(XPU_API_LIB                 "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
+SET(XPU_RT_LIB                  "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
+SET(XPU_SIM_LIB                 "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
+
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+
+INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
+INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
+
+FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(XPU)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
+  "        DESTINATION ${XPU_INSTALL_DIR})\n")
+
+ExternalProject_Add(
+    ${XPU_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${XPU_SOURCE_DIR}
+    DOWNLOAD_DIR          ${XPU_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
+                          && tar xvf xpu.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
+set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
+
+# generate a static dummy target to track xpulib dependencies
+# for cc_library(xxx SRCS xxx.c DEPS xpulib)
+generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
+
+TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
+ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 64878693518b686cc208c293c0ad0b410fa26058..9d07a0979d9392c9b2ab78562f8e0ceb8fc5d722 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -232,7 +232,9 @@ if(WIN32)
         CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
-        set(flag_var "${flag_var} /w")
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")
     endforeach(flag_var)
 endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 8842e8e21c6df224bb6341a4f7f526e3d61e92e1..1956e5c39ea2524d8a8e2650eb08f8d58f410b73 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -384,8 +384,12 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    if (APPLE OR WIN32)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
   endif()
 endfunction()
 
@@ -742,9 +746,14 @@ function(py_test TARGET_NAME)
                ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
+    
+    if (APPLE OR WIN32)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    else()
+        # No unit test should exceed 2 minutes in Linux.
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+    endif()
 
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction()
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 5a889dbc3143833ff48a972d17efc0aaf63f1810..20f27715e00457a8fe43f5c620e2a005387d7988 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST)
             SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
             DSTS ${dst_dir} ${dst_dir}/lib)
 
+    if (WITH_CRYPTO)
         set(dst_dir "${DST}/third_party/install/cryptopp")
         copy(${TARGET}
-        SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
+            SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
+            DSTS ${dst_dir} ${dst_dir}/lib)
+    endif()
 
     set(dst_dir "${DST}/third_party/install/xxhash")
     copy(${TARGET}
@@ -187,7 +189,7 @@ copy(inference_lib_dist
         SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
         DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
 copy(inference_lib_dist
-        SRCS  ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
         DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
 include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 
diff --git a/cmake/init.cmake b/cmake/init.cmake
index a33bfdbd412b15ef8d35c18b38ba6e18a1d03b11..7dfe60f9dd8f021facba6925a465cb58bc5de25d 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -1,29 +1,29 @@
 # Attention: cmake will append these flags to compile command automatically.
 # So if you want to add global option, change this file rather than flags.cmake
 
-# default: "-g"
-set(CMAKE_C_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+# NOT WIN32
+# DEBUG:  default: "-g"
+# RELEASE:  default: "-O3 -DNDEBUG"
+# RELWITHDEBINFO: default: "-O2 -g -DNDEBUG"
+# MINSIZEREL: default: "-O2 -g -DNDEBUG"
+
+if(NOT WIN32)
+    set(CMAKE_C_FLAGS_DEBUG "-g")
+    set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+
+    set(CMAKE_CXX_FLAGS_DEBUG "-g")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+endif()
+
+if(WITH_GPU)
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+endif()
 
-# default: "-g"
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-Os -DNDEBUG"
-set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 
-# default: "-g"
-set(CMAKE_CUDA_FLAGS_DEBUG "-g")
-# default: "-O3 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
-# default: "-O2 -g -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-# default: "-O1 -DNDEBUG"
-set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 5b03cbf8c7f844e163020ca17d25dc4b732fe636..f60a6dc3f0c89dd345b04ea3a1e213de770e5760 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -8,12 +8,13 @@ function(op_library TARGET)
     set(hip_cu_srcs)
     set(miopen_hip_cc_srcs)
     set(cu_cc_srcs)
+    set(xpu_cc_srcs)
     set(cudnn_cu_cc_srcs)
     set(cudnn_cu_srcs)
     set(CUDNN_FILE)
     set(mkldnn_cc_srcs)
     set(MKLDNN_FILE)
-    set(op_common_deps operator op_registry math_function layer)
+    set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
@@ -60,6 +61,12 @@ function(op_library TARGET)
                 list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
             endif()
         endif()
+        if(WITH_XPU)
+            string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
+                list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+            endif()
+        endif()
     else()
         foreach(src ${op_library_SRCS})
             if (${src} MATCHES ".*\\.hip.cu$")
@@ -76,6 +83,8 @@ function(op_library TARGET)
                 list(APPEND mkldnn_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cu.cc$")
                 list(APPEND cu_cc_srcs ${src})
+            elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+                list(APPEND xpu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
             else()
@@ -109,7 +118,7 @@ function(op_library TARGET)
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
     else()
-        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
             ${op_common_deps})
     endif()
 
@@ -118,7 +127,7 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op")
+"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -150,10 +159,11 @@ function(op_library TARGET)
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH hip_cu_srcs hip_cu_srcs_len)
     list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -179,6 +189,9 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
     endif()
 
+    if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+    endif()
     # pybind USE_OP_DEVICE_KERNEL for MKLDNN
     if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
       # Append first implemented MKLDNN activation operator
@@ -228,6 +241,7 @@ function(register_operators)
 
     file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
     string(REPLACE "_mkldnn" "" OPS "${OPS}")
+    string(REPLACE "_xpu" "" OPS "${OPS}")
     string(REPLACE ".cc" "" OPS "${OPS}")
     list(REMOVE_DUPLICATES OPS)
     list(LENGTH register_operators_DEPS register_operators_DEPS_len)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index be536b2eefbb123d73ec6f8d17c3d22e5aca2cfc..c9442e8f843ac152cac02908799a8d24f5951e58 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -250,6 +250,11 @@ if(WITH_GPU)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
 
+if(WITH_XPU)
+    include(external/xpu)          # download, build, install xpu
+    list(APPEND third_party_deps extern_xpu)
+endif(WITH_XPU)
+
 if(WITH_PSLIB)
     include(external/pslib)          # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)
@@ -263,10 +268,6 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
 
-if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
-endif()
 
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -274,6 +275,11 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 
 if(WITH_DISTRIBUTE)
+    if(WITH_GLOO)
+        include(external/gloo)
+        list(APPEND third_party_deps extern_gloo)
+    endif()
+
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
     else()
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index e1cb683e1ecf12d507a954003a8fae6312b85324..10d2c2c6c9172ef2025d72e1723d74c8423aed1d 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -27,6 +27,7 @@ add_subdirectory(fleet)
 add_subdirectory(io)
 #ddim lib
 proto_library(framework_proto SRCS framework.proto)
+proto_library(heter_service_proto SRCS heter_service.proto)
 proto_library(data_feed_proto SRCS data_feed.proto)
 proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
   data_feed_proto)
@@ -121,6 +122,10 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
+
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
+
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)
@@ -163,23 +168,23 @@ if(WITH_PYTHON)
   if (NOT WIN32)
     add_custom_command(TARGET framework_py_proto POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
       COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
       COMMENT "Copy generated python proto into directory paddle/fluid/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else(NOT WIN32)
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
     add_custom_command(TARGET framework_py_proto POST_BUILD
           COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
           COMMAND copy /Y *.py ${proto_dstpath}
 	  COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
           COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-	  COMMENT "Copy generated python proto into directory paddle/fleet/proto."
+	  COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()
@@ -195,20 +200,37 @@ cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc o
 if(WITH_DISTRIBUTE)
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
   dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+  heterxpu_trainer.cc
+  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
   pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer
+  device_context scope framework_proto trainer_desc_proto glog fs shell
+  fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
   lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
-  graph_to_program_pass variable_helper data_feed_proto timer monitor)
+  graph_to_program_pass variable_helper data_feed_proto timer monitor
+  heter_service_proto)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+elseif(WITH_PSLIB)
+  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
+  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
+  heterxpu_trainer.cc
+  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
+  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
+  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+  lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+  graph_to_program_pass variable_helper timer monitor pslib_brpc )
+  # TODO: Fix these unittest failed on Windows
+  if(NOT WIN32)
+    cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
+  endif()
 else()
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
   dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+  heterxpu_trainer.cc
+  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
   pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto data_feed_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
+  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+  lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
   graph_to_program_pass variable_helper timer monitor)
   # TODO: Fix these unittest failed on Windows
   if(NOT WIN32)
@@ -250,6 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
 
 cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
+cc_library(generator SRCS generator.cc)
 
 # Get the current working branch
 execute_process(
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index 1cf4eb6c2989346c9e9acef648aa74615c7bcb10..d42bd0b16d7a84987517326af9567809fd29da4d 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -45,14 +45,35 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
 // get CommContext and remote send and recv op
 void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
 #ifdef PADDLE_WITH_DISTRIBUTE
-  // init communicator here
-  auto *instance = operators::distributed::Communicator::GetInstance();
-  auto initialized = instance ? true : false;
-  PADDLE_ENFORCE_EQ(initialized, true,
-                    platform::errors::InvalidArgument(
-                        "Communicator is not Initialized, you may use "
-                        "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
-                        "develop/markdown_doc/transpiler)"));
+
+  bool need_communicator = false;
+
+  for (auto &node : graphs[0]->Nodes()) {
+    VLOG(3) << "node name " << node->Name();
+    if (node && node->IsOp()) {
+      if (node->Name() == "send") {
+        auto send_varnames =
+            BOOST_GET_CONST(std::vector<std::string>,
+                            node->Op()->GetNullableAttr("send_varnames"));
+
+        if (send_varnames.size() > 0) {
+          need_communicator = true;
+          break;
+        }
+      }
+    }
+  }
+
+  if (need_communicator) {
+    // init communicator here
+    auto *instance = operators::distributed::Communicator::GetInstance();
+    auto initialized = instance ? true : false;
+    PADDLE_ENFORCE_EQ(initialized, true,
+                      platform::errors::InvalidArgument(
+                          "Communicator is not Initialized, you may use "
+                          "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
+                          "develop/markdown_doc/transpiler)"));
+  }
 
 #endif
 }
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 0bd6a79b55392e8bfb8f33b0a29b4cf1df0d44dc..5574a55e18c6d9806cb878dc69ec597f81da97d8 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item,
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
     } else {
-      dst_item->ShareDataWith(src_item);
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
     }
   } else {
     dst_item->clear();
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 05c18544d6a2f7c8998d765fdad292bc4330bb38..3336b5783a8cf14358db5b74ae799ba470a640fa 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -51,10 +52,23 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 
 class FleetWrapper;
 
+#ifdef PADDLE_WITH_PSLIB
+class HeterWrapper;
+#endif
+
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
   virtual void Initialize(const TrainerDesc& param);
+#ifdef PADDLE_WITH_CUDA
+  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+
+  void AddPlace(const paddle::platform::Place place) {
+    places_.push_back(place);
+  }
+
+  void AddThreadScope(Scope* scope) { thread_scopes_.push_back(scope); }
+#endif
   int Start();
   void Stop();
   void SetRootScope(Scope* scope) { root_scope_ = scope; }
@@ -62,6 +76,7 @@ class PullDenseWorker {
   void ResetThreadVersion(uint64_t table_id);
   void Wait(std::vector<::std::future<int32_t>>* status_vec);
   void PullDense(bool force_update = false);
+  void CreatePinVar();
   int GetThreadIdByScope(const Scope* scope);
   void SetThreadIdByScope(const Scope* scope, int tid);
   static std::shared_ptr<PullDenseWorker> GetInstance() {
@@ -105,6 +120,12 @@ class PullDenseWorker {
   std::mutex mutex_for_mean_scale_;
   float total_batch_num_ = 0;
   std::unordered_map<const Scope*, int> scope_to_thread_id_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
+  std::vector<paddle::platform::Place> places_;
+  std::vector<Scope*> thread_scopes_;
+#endif
 };
 
 // should incorporate different type of device
@@ -126,6 +147,8 @@ class DeviceWorker {
   virtual void BindingDataFeedMemory() = 0;
   virtual void SetRootScope(Scope* root_scope);
   virtual void SetDataFeed(DataFeed* data_feed);
+  virtual void SetWorkerNum(int num) {}
+  virtual void CacheProgram(const ProgramDesc& main_program) {}
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
   }
@@ -161,6 +184,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
+  TrainerDesc trainer_desc_;
 
   // dump params or grads for debug
   bool need_dump_param_;
@@ -306,6 +330,87 @@ class DownpourWorkerOpt : public DownpourWorker {
   uint64_t async_tid_ = 0;
 };
 
+#ifdef PADDLE_WITH_PSLIB
+class HeterCpuWorker : public HogwildWorker {
+ public:
+  HeterCpuWorker() {}
+  virtual ~HeterCpuWorker() {}
+  virtual void Initialize(const TrainerDesc& desc);
+  virtual void TrainFiles();
+  virtual void TrainFilesWithProfiler();
+  virtual void SetNeedDump(bool need_dump_field);
+  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
+  virtual void SetWorkerNum(int num) { worker_num_ = num; }
+  virtual void Schedule(int taskid);
+  virtual void JumpContext(std::shared_ptr<HeterTask> task);
+  virtual void CacheProgram(const ProgramDesc& main_program) {
+    new (&program_) ProgramDesc(main_program);
+  }
+  virtual void GetXpuOpIndex();
+
+ protected:
+  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
+  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
+  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
+  void FillSparseValue(std::shared_ptr<HeterTask> task, size_t table_id);
+  void PushGradients();
+  void CollectLabelInfo(std::shared_ptr<HeterTask> task, size_t table_id);
+  void AdjustInsWeight(std::shared_ptr<HeterTask> task);
+  void DumpParam();
+  void CopySparseTable();
+  void CopyDenseTable();
+  void CopyDenseVars();
+
+ private:
+  int mpi_rank_;
+  int worker_num_;
+  int xpu_begin_op_index_;
+  int xpu_end_op_index_;
+  ProgramDesc program_;
+  HeterObjectPool<HeterTask> object_pool_;
+  HeterList<int, std::shared_ptr<HeterTask>> run_queue_;
+  HeterList<int, std::shared_ptr<HeterTask>> wait_queue_;
+  bool need_dump_param_;
+  std::vector<std::string> dump_param_;
+  bool need_to_push_dense_;
+  bool need_dump_field_;
+  bool dump_slot_;
+  bool need_to_push_sparse_;
+  std::vector<std::string> dump_fields_;
+  ChannelWriter<std::string> writer_;
+  DownpourWorkerParameter param_;
+  float scale_datanorm_;
+  // just save the value in param_ for easy access
+  std::map<uint64_t, std::string> label_var_name_;
+  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
+  platform::Place root_place_;
+  // actually pushed feasign of each table
+  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
+
+  // skipped ops
+  std::vector<std::string> skip_ops_;
+
+  std::vector<::std::future<int32_t>> push_sparse_status_;
+  std::vector<::std::future<int32_t>> push_dense_status_;
+
+  // adjust ins weight
+  AdjustInsWeightConfig adjust_ins_weight_config_;
+  std::vector<float> nid_show_;
+  // check nan and inf during training
+  std::vector<std::string> check_nan_var_names_;
+  // copy table
+  CopyTableConfig copy_table_config_;
+  std::map<uint64_t, uint64_t> table_dependency_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
+  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
+};
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 class SectionWorker : public DeviceWorker {
  public:
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index 80e4000c9dc686bc413b38fcf8298dc8b5399335..67be8db6e80329de4323e4cb8f904a24753f56bc 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -62,6 +62,9 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
+#ifdef PADDLE_WITH_PSLIB
+REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
+#endif
 #if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc
index e2a7375df9e46713aebe9f815f93809568b86c0f..4d55d2987f3f39525c1070e3213f3a2e84e18dff 100644
--- a/paddle/fluid/framework/dist_multi_trainer.cc
+++ b/paddle/fluid/framework/dist_multi_trainer.cc
@@ -35,7 +35,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   dump_file_num_ = trainer_desc.dump_file_num();
   const std::vector<paddle::framework::DataFeed *> readers =
       dataset->GetReaders();
-
+  RegisterHeterCallback();
   thread_num_ = readers.size();
   workers_.resize(thread_num_);
   for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
@@ -55,6 +55,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
     workers_[i]->SetDumpParamVector(dump_param_);
     workers_[i]->InitRandomDumpConfig(trainer_desc);
     workers_[i]->Initialize(trainer_desc);
+    workers_[i]->SetWorkerNum(thread_num_);
   }
 
   VLOG(3) << "going to initialize pull dense worker";
@@ -64,6 +65,13 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   SetDebug(trainer_desc.debug());
 }
 
+void DistMultiTrainer::RegisterHeterCallback() {
+  auto fleet_ptr = FleetWrapper::GetInstance();
+  fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) {
+    // workers_[worker]->Schedule(taskid);
+  });
+}
+
 void DistMultiTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
   for (int i = 0; i < thread_num_; ++i) {
@@ -90,6 +98,9 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
     workers_[i]->SetRootScope(root_scope_);
     workers_[i]->CreateDeviceResource(main_program);  // Program
     workers_[i]->BindingDataFeedMemory();
+#ifdef PADDLE_WITH_PSLIB
+    workers_[i]->CacheProgram(main_program);
+#endif
   }
   // Scope* -> thread id, it will be used in push_dense op
   for (int i = 0; i < thread_num_; ++i) {
@@ -104,6 +115,11 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
   }
   pull_dense_worker_->SetRootScope(root_scope_);
   pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_PSLIB
+  for (int i = 0; i < thread_num_; ++i) {
+    workers_[i]->GetXpuOpIndex();
+  }
+#endif
   VLOG(3) << "init other env done.";
 }
 
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 05b7a16f1594f370cbf73ab7fdb4c98e3bb76024..551d1342edeb335d1cad4782f85ae9f94f8739bd 100755
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -55,9 +55,8 @@ message LarsConfig {
 }
 
 message LambConfig {
-  optional float beta1 = 1 [ default = 0.001 ];
-  optional float beta2 = 2 [ default = 0.999 ];
-  optional float epsilon = 3 [ default = 0.000001 ];
+  optional float lamb_weight_decay = 1 [ default = 0.01 ];
+  repeated string exclude_from_weight_decay = 2;
 }
 
 message BuildStrategy {
@@ -80,7 +79,7 @@ message ExecutionStrategy {
 }
 
 message AsyncConfig {
-  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 k_steps = 1 [ default = -1 ];
   optional int32 max_merge_var_num = 2 [ default = 1 ];
   optional int32 send_queue_size = 3 [ default = 16 ];
   optional bool independent_recv_thread = 4 [ default = false ];
@@ -114,7 +113,9 @@ message DistributedStrategy {
   optional bool fuse_all_reduce_ops = 18 [ default = true ];
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
-  // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ];
+  optional bool cudnn_exhaustive_search = 21 [ default = true ];
+  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
+  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index f2421248e33f236b9fa861f22ce4848531cf1791..180b33d0cb72e2c4c9e6e8caff9f0ef5f1b04689 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
     return ctx;
   }
 
+  inline ::DLContext operator()(const platform::XPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::XPUPlace is not supported"));
+  }
+
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 3f70835c9d312a652cd917ba53fb2f405ab401cc..1c64bf1d3f7f31f42308395b5b054f62fd97b429 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -379,7 +379,7 @@ void DownpourWorker::CopyDenseTable() {
       pull_dense_status.resize(0);
       fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
                                      dense_value_names_[dest_table],
-                                     &pull_dense_status);
+                                     &pull_dense_status, true);
       for (auto& t : pull_dense_status) {
         t.wait();
         auto status = t.get();
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8e2e1d38a66d1039519bab312f77bef6604d8ec1..f11edb9a41bdcbcb33efc600f1d7d8f70fb76f45 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
         gc.reset(new DefaultStreamGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
       }
-    } else if (platform::is_cpu_place(place_)) {
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
 #endif
+    } else if (platform::is_cpu_place(place_)) {
       gc.reset(new CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
+    } else if (platform::is_xpu_place(place_)) {
+#ifdef PADDLE_WITH_XPU
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
 #endif
+    }
   }
 
   for (int64_t i = start_op_index; i < end_op_index; ++i) {
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 6922f92c8f7a3aa43f13fda59f2631f8529d5cc7..3eee0a1abbaf04aef2faa9e52c552e89ce84c7de 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -19,4 +19,6 @@ else()
     cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)
 
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
+
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 6f571fa8d817bfa0323ea22b4b14ee96111642c4..335cbc382c178b1a14949764f2908dc402298868 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -154,6 +154,219 @@ void FleetWrapper::CreateClient2ClientConnection() {
 #endif
 }
 
+#ifdef PADDLE_WITH_PSLIB
+void FleetWrapper::HeterPullSparseVars(
+    int workerid, std::shared_ptr<HeterTask> task, const uint64_t table_id,
+    const std::vector<std::string>& var_names, int fea_value_dim,
+    const std::vector<std::string>& var_emb_names) {
+  std::vector<::std::future<int32_t>> pull_sparse_status;
+  pull_sparse_status.resize(0);
+  auto& scope = *(task->scope_);
+  auto& fea_keys = (task->features_)[table_id];
+  auto& fea_values = (task->feature_values_)[table_id];
+  fea_keys.clear();
+  for (size_t var_index = 0; var_index < var_names.size(); ++var_index) {
+    const std::string& name = var_names[var_index];
+    Variable* var = scope.FindVar(name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
+    int64_t* ids = tensor->data<int64_t>();
+    size_t len = tensor->numel();
+
+    // skip slots which do not have embedding
+    const std::string& emb_name = var_emb_names[var_index];
+    Variable* emb_var = scope.FindVar(emb_name);
+    if (emb_var == nullptr) {
+      continue;
+    }
+
+    for (auto i = 0u; i < len; ++i) {
+      if (ids[i] == 0u) {
+        continue;
+      }
+      fea_keys.push_back(static_cast<uint64_t>(ids[i]));
+    }
+  }
+  fea_values.resize(fea_keys.size() + 1);
+  for (auto& t : fea_values) {
+    t.resize(fea_value_dim);
+  }
+  std::vector<float*> pull_result_ptr;
+  for (auto& t : fea_values) {
+    pull_result_ptr.push_back(t.data());
+  }
+  auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse(
+      workerid, pull_result_ptr.data(), table_id, fea_keys.data(),
+      fea_keys.size(), task->taskid_);
+  pull_sparse_status.push_back(std::move(status));
+  for (auto& t : pull_sparse_status) {
+    t.wait();
+    auto status = t.get();
+    if (status != 0) {
+      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
+      sleep(sleep_seconds_before_fail_exit_);
+      exit(-1);
+    }
+  }
+}
+
+void FleetWrapper::HeterPushSparseVars(
+    std::shared_ptr<HeterTask> task, const uint64_t table_id,
+    const std::vector<std::string>& sparse_key_names,
+    const std::vector<std::string>& sparse_grad_names, const int emb_dim,
+    std::vector<::std::future<int32_t>>* push_sparse_status, const bool use_cvm,
+    const bool dump_slot, const bool no_cvm) {
+  auto& scope = *(task->scope_);
+  int batch_size = task->cur_batch_;
+  int offset = 2;
+  int slot_offset = 0;
+  int grad_dim = emb_dim;
+  int show_index = 0;
+  int click_index = 1;
+  auto& fea_keys = (task->features_)[table_id];
+  auto& fea_labels = (task->feature_labels_)[table_id];
+  auto& push_values = (task->feature_grads_)[table_id];
+  auto& sparse_push_keys = (task->sparse_push_keys_)[table_id];
+
+  if (use_cvm) {
+    offset = 0;
+    grad_dim = emb_dim - 2;
+  }
+  if (no_cvm) {
+    offset = 0;
+    grad_dim = emb_dim;
+  }
+  if (dump_slot) {
+    slot_offset = 1;
+    show_index = 1;
+    click_index = 2;
+  }
+  CHECK_GE(grad_dim, 0);
+
+  sparse_push_keys.clear();
+  sparse_push_keys.reserve(fea_keys.size() + 1);
+  push_values.resize(fea_keys.size() + 1);
+  for (auto& t : push_values) {
+    t.resize(emb_dim + offset + slot_offset);
+  }
+  uint64_t fea_idx = 0u;
+  for (size_t i = 0;
+       i < sparse_key_names.size() && i < sparse_grad_names.size(); ++i) {
+    Variable* var = scope.FindVar(sparse_key_names[i]);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
+      exit(-1);
+    }
+    size_t len = tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    int slot = 0;
+    if (dump_slot) {
+      slot = boost::lexical_cast<int>(sparse_key_names[i]);
+    }
+    Variable* g_var = scope.FindVar(sparse_grad_names[i]);
+    if (g_var == nullptr) {
+      continue;
+    }
+    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
+    if (g_tensor == nullptr) {
+      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
+      exit(-1);
+    }
+    float* g = g_tensor->data<float>();
+
+    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
+      int dim = emb_dim + offset;
+      Eigen::Map<
+          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+          g_mat(g, g_tensor->numel() / dim, dim);
+      g_mat.rightCols(grad_dim) *= batch_size;
+    }
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        g += emb_dim;
+        continue;
+      }
+      sparse_push_keys.push_back(ids[id_idx]);
+      CHECK(fea_idx < push_values.size());
+
+      if (use_cvm || no_cvm) {
+        memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
+               sizeof(float) * emb_dim);
+      } else {
+        CHECK(fea_idx < fea_labels.size());
+        memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
+               sizeof(float) * emb_dim);
+        push_values[fea_idx][show_index] = 1.0f;
+        push_values[fea_idx][click_index] =
+            static_cast<float>(fea_labels[fea_idx]);
+      }
+      if (dump_slot) {
+        push_values[fea_idx][0] = static_cast<float>(slot);
+      }
+      g += emb_dim;
+      fea_idx++;
+    }
+  }
+  // slots whose embedding has been stop gradient or
+  // not involved in forward-backward
+  uint64_t no_grad_fea_num = 0u;
+  for (size_t i = sparse_grad_names.size(); i < sparse_key_names.size(); ++i) {
+    Variable* var = scope.FindVar(sparse_key_names[i]);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
+      exit(-1);
+    }
+    size_t len = tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        continue;
+      }
+      ++no_grad_fea_num;
+    }
+  }
+  CHECK(fea_idx + no_grad_fea_num == fea_keys.size())
+      << "fea_idx: " << fea_idx << " no_grad_fea_num: " << no_grad_fea_num
+      << " features size: " << fea_keys.size();
+  CHECK(fea_idx == sparse_push_keys.size());
+  if (fea_idx == 0) {
+    return;
+  }
+  std::vector<float*> push_g_vec;
+  for (auto i = 0u; i < sparse_push_keys.size(); ++i) {
+    push_g_vec.push_back(push_values[i].data());
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_sparse(
+      table_id, sparse_push_keys.data(), (const float**)push_g_vec.data(),
+      sparse_push_keys.size());
+  push_sparse_status->push_back(std::move(status));
+}
+#endif
+
+int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "calling FleetWrapper::RegisterHeterCallback";
+  VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
+  VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
+  return pslib_ptr_->_worker_ptr->registe_heter_callback(handler);
+#else
+  VLOG(0) << "FleetWrapper::RegisterHeterCallback"
+          << " does nothing when no pslib";
+#endif
+  return 0;
+}
+
 void FleetWrapper::PullSparseToLocal(const uint64_t table_id,
                                      int fea_value_dim) {
 #ifdef PADDLE_WITH_PSLIB
@@ -421,13 +634,17 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
 void FleetWrapper::PullDenseVarsAsync(
     const Scope& scope, const uint64_t tid,
     const std::vector<std::string>& var_names,
-    std::vector<::std::future<int32_t>>* pull_dense_status) {
+    std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu) {
 #ifdef PADDLE_WITH_PSLIB
   auto& regions = _regions[tid];
   regions.clear();
   regions.resize(var_names.size());
   for (auto i = 0u; i < var_names.size(); ++i) {
-    Variable* var = scope.FindVar(var_names[i]);
+    std::string varname = var_names[i];
+    if (!in_cpu) {
+      varname = var_names[i] + "pin";
+    }
+    Variable* var = scope.FindVar(varname);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
     float* w = tensor->data<float>();
     paddle::ps::Region reg(w, tensor->numel());
@@ -485,6 +702,57 @@ void FleetWrapper::PushDenseVarsSync(
     Scope* scope, const uint64_t table_id,
     const std::vector<std::string>& var_names) {}
 
+#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    float scale_datanorm, int batch_size, const paddle::platform::Place& place,
+    cudaStream_t stream, cudaEvent_t event) {
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g_data = tensor->data<float>();
+
+    Variable* pin_var = scope.FindVar(t + "pin");
+    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+    float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
+                                                   platform::CUDAPinnedPlace());
+    memory::Copy(platform::CUDAPinnedPlace(), pin_g,
+                 BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
+                 sizeof(float) * count, stream);
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
+    cudaEventSynchronize(event);
+
+    float* g = pin_g;
+    if (scale_datanorm >= 0) {
+      if (t.find(".batch_size@GRAD") != std::string::npos ||
+          t.find(".batch_sum@GRAD") != std::string::npos) {
+        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
+        float scale = 1.0 / batch_size;
+        mat *= scale;
+      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
+        VLOG(3) << "epsilon: " << scale_datanorm;
+        for (int i = 0; i < count; ++i) {
+          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
+                 batch_size * scale_datanorm;
+        }
+      }
+    }
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
+
+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  if (push_sparse_status) {
+    push_sparse_status->push_back(std::move(status));
+  }
+}
+
+#endif
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
     const std::vector<std::string>& var_names,
@@ -1085,8 +1353,8 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
   push_status.wait();
   auto status = push_status.get();
   if (status != 0) {
-    PADDLE_THORW(platform::errors::Fatal(
-        "push shrink dense param failed, status is [%d].", status));
+    // PADDLE_THORW(platform::errors::Fatal(
+    //    "push shrink dense param failed, status is [%d].", status));
     sleep(sleep_seconds_before_fail_exit_);
     exit(-1);
   }
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index 4c0564f87d48b4ba06bdb75e94fbb80b3d4f448a..92f3a625a755bba4989033c0cd41d9b25591c960 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -28,6 +28,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -80,6 +81,24 @@ class FleetWrapper {
     pull_local_thread_num_ = thread_num;
   }
 
+#ifdef PADDLE_WITH_PSLIB
+  void HeterPullSparseVars(int workerid, std::shared_ptr<HeterTask> task,
+                           const uint64_t table_id,
+                           const std::vector<std::string>& var_names,
+                           int fea_dim,
+                           const std::vector<std::string>& var_emb_names);
+
+  void HeterPushSparseVars(
+      std::shared_ptr<HeterTask> task, const uint64_t table_id,
+      const std::vector<std::string>& sparse_key_names,
+      const std::vector<std::string>& sparse_grad_names, const int emb_dim,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      const bool use_cvm, const bool dump_slot, const bool no_cvm);
+#endif
+
+  typedef std::function<void(int, int)> HeterCallBackFunc;
+  int RegisterHeterCallback(HeterCallBackFunc handler);
+
   // Pull sparse variables from server in sync mode
   // Param<in>: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
   // Param<out>: fea_values
@@ -118,15 +137,24 @@ class FleetWrapper {
   void PullDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
-      std::vector<::std::future<int32_t>>* pull_dense_status);
+      std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu);
 
   // push dense parameters(not gradients) to server in sync mode
   void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
                           const std::vector<std::string>& var_names);
 
-  // Push dense variables to server in async mode
-  // Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
-  // Param<out>: push_sparse_status
+// Push dense variables to server in async mode
+// Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
+// Param<out>: push_sparse_status
+#ifdef PADDLE_WITH_CUDA
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place, cudaStream_t stream,
+      cudaEvent_t event);
+#endif
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index 45512d6adc49c577c2332ae2ef2c42408138990e..bb958f1ac015bfd1a71b3ccd530406a33e4e37cb 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -54,9 +54,8 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
       paddle::framework::fs_remove(tmp);
       if (i == retry_times_) {
         VLOG(0) << "fs_open_write failed, retry times reaches limit";
-        PADDLE_THROW(platform::errors::PreconditionNotMet(
-            "fs_open_write failed, retry times reaches"
-            " limit ",
+        PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+            "fs_open_write failed, retry times reaches %d limit.",
             retry_times_));
       }
     } else {
@@ -143,7 +142,7 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
           break;
         }
       }
-      PADDLE_THROW(platform::errors::ExecutionTimeout(
+      PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
           "TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
           last_check_rank));
     }
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h
index 3f932ee226ca85b86c92fef1b30420d782d9bc62..758cde78530d7b334a7100bce6ce32c2869cc066 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.h
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.h
@@ -105,6 +105,11 @@ enum GlooStoreType { HDFS, HTTP };
 
 class GlooWrapper {
  public:
+  static std::shared_ptr<GlooWrapper> GetInstance() {
+    static auto s_instance = std::make_shared<GlooWrapper>();
+    return s_instance;
+  }
+
   GlooWrapper() {}
 
   virtual ~GlooWrapper() {}
@@ -153,6 +158,11 @@ class GlooWrapper {
 #endif
   }
 
+  bool IsInitialized() { return is_initialized_; }
+#ifdef PADDLE_WITH_GLOO
+  std::shared_ptr<gloo::Context> GetContext() { return context_; }
+#endif
+
   template <typename T>
   std::vector<T> AllReduce(std::vector<T>& sendbuf,            // NOLINT
                            const std::string& mode = "sum") {  // NOLINT
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b70d5e5fc1ae6c90dac4ebf1d86353e38a79492d
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -0,0 +1,308 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/fleet/heter_wrapper.h"
+#include <algorithm>
+#include <utility>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/io/fs.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/timer.h"
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+std::shared_ptr<HeterWrapper> HeterWrapper::s_instance_ = NULL;
+bool HeterWrapper::is_initialized_ = false;
+
+void HeterWrapper::CreateClient2XpuConnection() {
+  brpc::ChannelOptions options;
+  options.protocol = "baidu_std";
+  options.connection_type = "single";
+  options.timeout_ms = 2000000;
+
+  xpu_channels_.resize(xpu_list_.size());
+  for (size_t i = 0; i < xpu_list_.size(); ++i) {
+    VLOG(3) << "channel init: " << xpu_list_[i];
+    xpu_channels_[i].reset(new brpc::Channel());
+    if (xpu_channels_[i]->Init(xpu_list_[i].c_str(), "", &options) != 0) {
+      VLOG(0) << "server channel init fail";
+    }
+  }
+}
+
+void HeterWrapper::RegisterServiceHandler(int cmd, HeterServiceHandler func) {
+  service_.RegisterServiceHandler(cmd, func);
+}
+
+void HeterWrapper::SetXpuList(const std::vector<std::string>& xpu_list) {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to set xpu list";
+  for (auto& x : xpu_list) {
+    xpu_list_.push_back(x);
+    VLOG(3) << "set xpu list:  " << x << " size: " << xpu_list_.size();
+  }
+#endif
+}
+
+void HeterWrapper::StartXpuService(const std::string& ip, uint32_t port) {
+  std::string ip_port = ip + ":" + std::to_string(port);
+  VLOG(3) << "xpu server starts at " << ip_port;
+
+  server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
+  brpc::ServerOptions options;
+  if (server_.Start(ip_port.c_str(), &options) != 0) {
+    VLOG(0) << "xpu server start fail";
+  }
+}
+
+// void HeterWrapper::SerializeToReq(const std::string& varname,
+// Scope* scope, HeterRequest& request) {
+//  auto* req_var = request.mutable_vars();
+
+void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
+                                  VariableMessage* req_var) {
+  Variable* var = scope->FindVar(varname);
+  if (var == nullptr) {
+    return;
+  }
+  LoDTensor* tensor = var->GetMutable<LoDTensor>();
+  req_var->set_varname(varname);
+  req_var->set_type(LOD_TENSOR);
+  req_var->set_data_type(static_cast<VariableMessage::Type>(tensor->type()));
+
+  for (auto& dim : framework::vectorize(tensor->dims())) {
+    req_var->add_dims(dim);
+  }
+  const framework::LoD lod = tensor->lod();
+  if (lod.size() > 0) {
+    req_var->set_lod_level(lod.size());
+    for (auto& each : lod) {
+      VariableMessage::LodData* lod_inner = req_var->add_lod();
+      for (auto& d : each) {
+        lod_inner->add_lod_data(d);
+      }
+    }
+  }
+
+  auto* req_data = req_var->mutable_data();
+  req_data->clear();
+  req_data->resize(tensor->numel() * SizeOfType(tensor->type()));
+  char* data_ptr = const_cast<char*>(req_data->data());
+
+  if (platform::is_cpu_place(tensor->place())) {
+    memcpy(data_ptr, tensor->data<void>(),
+           tensor->numel() * SizeOfType(tensor->type()));
+  }
+#ifdef PADDLE_WITH_CUDA
+  else {
+    memory::Copy(platform::CPUPlace(), data_ptr,
+                 BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
+                 tensor->data<void>(),
+                 tensor->numel() * SizeOfType(tensor->type()), nullptr);
+  }
+#endif
+}
+
+// void HeterWrapper::DeSerializeToTensor(Scope* scope,
+// const HeterRequest* request) {
+#ifdef PADDLE_WITH_CUDA
+void HeterWrapper::DeSerializeToTensor(Scope* scope,
+                                       const VariableMessage& req_var,
+                                       platform::Place place,
+                                       cudaStream_t stream) {
+#else
+void HeterWrapper::DeSerializeToTensor(Scope* scope,
+                                       const VariableMessage& req_var,
+                                       platform::Place place) {
+#endif
+  // const VariableMessage& req_var = request->vars();
+  auto* var = scope->FindVar(req_var.varname());
+  auto* tensor = var->GetMutable<LoDTensor>();
+
+  std::vector<int> vec_dim;
+  for (auto& x : req_var.dims()) {
+    vec_dim.push_back(x);
+  }
+  tensor->Resize(make_ddim(vec_dim));
+
+  LoD lod;
+  for (int i = 0; i < req_var.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
+      v.push_back(req_var.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+
+  void* tensor_data =
+      tensor->mutable_data(place, ToVarType(req_var.data_type()));
+
+#ifdef PADDLE_WITH_CUDA
+  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+               platform::CPUPlace(), req_var.data().data(),
+               tensor->numel() * SizeOfType(tensor->type()), stream);
+#else
+  memcpy(tensor_data, req_var.data().data(),
+         tensor->numel() * SizeOfType(tensor->type()));
+#endif
+}
+
+framework::proto::VarType::Type HeterWrapper::ToVarType(
+    VariableMessage::Type type) {
+  switch (type) {
+    case VariableMessage::FP32:
+      return framework::proto::VarType::FP32;  // NOLINT
+    case VariableMessage::FP64:
+      return framework::proto::VarType::FP64;  // NOLINT
+    case VariableMessage::INT32:
+      return framework::proto::VarType::INT32;  // NOLINT
+    case VariableMessage::INT64:
+      return framework::proto::VarType::INT64;  // NOLINT
+    case VariableMessage::BOOL:
+      return framework::proto::VarType::BOOL;  // NOLINT
+    default:
+      VLOG(0) << "Not support type " << type;
+  }
+}
+
+void HeterWrapper::StopXpuService(int num) {
+  HeterRequest request;
+  HeterResponse response;
+  brpc::Controller cntl;
+  request.set_cmd(2);
+  // for (size_t i = 0; i < xpu_channels_.size(); ++i) {
+  HeterService_Stub stub(xpu_channels_[num].get());
+  stub.service(&cntl, &request, &response, NULL);
+  if (cntl.Failed()) {
+    VLOG(0) << "call stop xpu service fail: " << cntl.ErrorText();
+  } else {
+    VLOG(3) << "call stop xpu service success";
+  }
+  // }
+}
+
+void HeterWrapper::EndPass(Scope* scope, int num) {
+  HeterRequest request;
+  HeterResponse response;
+  brpc::Controller cntl;
+  request.set_cmd(1);
+  // for (size_t i = 0; i < xpu_channels_.size(); ++i) {
+  HeterService_Stub stub(xpu_channels_[num].get());
+  stub.service(&cntl, &request, &response, NULL);
+  if (cntl.Failed()) {
+    VLOG(0) << "call end pass fail: " << cntl.ErrorText();
+  } else {
+    VLOG(3) << "call end pass success";
+    for (int j = 0; j < response.vars_size(); ++j) {
+      DeSerializeToTensor(scope, response.vars(j), platform::CPUPlace());
+    }
+  }
+  // }
+}
+
+void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
+                                 HeterCpuWorker* worker, int mpi_rank,
+                                 std::vector<std::string>& send_vars) {
+  HeterRequest request;
+  request.set_cmd(0);
+  request.set_cur_batch(task->cur_batch_);
+
+  OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
+    auto* closure = (OnHeterRpcDone*)done;
+    if (closure->cntl.Failed()) {
+      VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
+    } else {
+      VLOG(3) << "call xpu success";
+    }
+    // DeSerializeToTensor(task->scope_,
+    // closure->response.vars(), platform::CPUPlace());
+    for (int i = 0; i < closure->response.vars_size(); ++i) {
+      DeSerializeToTensor(task->scope_, closure->response.vars(i),
+                          platform::CPUPlace());
+    }
+
+    worker->Schedule(task->taskid_);
+  });
+
+  //  std::vector<std::string> varnames = {"click", "12345"};
+  //  //varnames.push_back(send_var);
+  //  //if (send_var == "_generated_var_412") {
+  //  varnames.push_back("filter_by_instag_0.tmp_0");
+  //  varnames.push_back("filter_by_instag_2.tmp_0");
+  //  varnames.push_back("filter_by_instag_0.tmp_1");
+  //  varnames.push_back("concat_1.tmp_0");
+  // }
+  for (auto& varname : send_vars) {
+    auto* req_var = request.add_vars();
+    SerializeToReq(varname, task->scope_, req_var);
+  }
+
+  int num = mpi_rank % xpu_channels_.size();
+  HeterService_Stub stub(xpu_channels_[num].get());
+  // stub.service(&cntl, &request, &response,
+  // brpc::NewCallback(&HeterWrapper::RpcCallBack,
+  // response, cntl, worker, task));
+  stub.service(&done->cntl, &request, &done->response, done);
+}
+
+void HeterWrapper::CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
+                                     HeterCpuWorker* worker, int mpi_rank,
+                                     std::vector<std::string>& send_vars) {
+  HeterRequest request;
+  HeterResponse response;
+  brpc::Controller cntl;
+  request.set_cmd(0);
+  request.set_cur_batch(task->cur_batch_);
+
+  // std::vector<std::string> varnames = {"concat_1.tmp_0", "click", "12345"};
+  for (auto& varname : send_vars) {
+    auto* req_var = request.add_vars();
+    SerializeToReq(varname, task->scope_, req_var);
+  }
+
+  HeterService_Stub stub(xpu_channels_[0].get());
+  stub.service(&cntl, &request, &response, NULL);
+  if (cntl.Failed()) {
+    VLOG(0) << "call xpu fail: " << cntl.ErrorText();
+  } else {
+    VLOG(3) << "call xpu success";
+    for (int i = 0; i < response.vars_size(); ++i) {
+      DeSerializeToTensor(task->scope_, response.vars(i), platform::CPUPlace());
+    }
+  }
+}
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ba4e00fc851b1f1b6e10764b5bc069f515d4b47
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_wrapper.h
@@ -0,0 +1,123 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+#include <ctime>
+#include <map>
+#include <memory>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#ifdef PADDLE_WITH_PSLIB
+#include "paddle/fluid/framework/heter_service.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace framework {
+
+class HeterCpuWorker;
+
+typedef std::function<void(void*)> HeterRpcCallbackFunc;
+
+class OnHeterRpcDone : public google::protobuf::Closure {
+ public:
+  OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
+  virtual ~OnHeterRpcDone() {}
+  void Run() {
+    std::unique_ptr<OnHeterRpcDone> self_guard(this);
+    handler_(this);
+  }
+
+  HeterRpcCallbackFunc handler_;
+  HeterResponse response;
+  brpc::Controller cntl;
+};
+
+class HeterWrapper {
+ public:
+  virtual ~HeterWrapper() {
+    server_.Stop(1000);
+    server_.Join();
+  }
+
+  HeterWrapper() {}
+
+  static void HeterRpcCallBack(HeterResponse* response, brpc::Controller* cntl,
+                               HeterCpuWorker* worker,
+                               std::shared_ptr<HeterTask> task);
+
+  void CreateClient2XpuConnection();
+
+  void RegisterServiceHandler(int cmd, HeterServiceHandler func);
+
+  void StartXpuService(const std::string& ip, uint32_t port);
+
+  void CallRemoteXpu(std::shared_ptr<HeterTask> task, HeterCpuWorker* worker,
+                     int mpi_rank, std::vector<std::string>& send_vars);
+
+  void CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
+                         HeterCpuWorker* worker, int mpi_rank,
+                         std::vector<std::string>& send_vars);
+
+  void StopXpuService(int num);
+
+  void EndPass(Scope* scope, int num);
+
+  void SerializeToReq(const std::string& varname, Scope* scope,
+                      VariableMessage* req_var);
+
+  framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
+
+#ifdef PADDLE_WITH_CUDA
+  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
+                           platform::Place place,
+                           cudaStream_t stream = nullptr);
+#else
+  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
+                           platform::Place place);
+#endif
+  // HeterWrapper singleton
+  static std::shared_ptr<HeterWrapper> GetInstance() {
+    if (NULL == s_instance_) {
+      s_instance_.reset(new paddle::framework::HeterWrapper());
+    }
+    return s_instance_;
+  }
+
+  std::vector<std::string>& GetXpuList() { return xpu_list_; }
+
+  void SetXpuList(const std::vector<std::string>& xpu_list);
+
+ private:
+  static std::shared_ptr<HeterWrapper> s_instance_;
+
+ protected:
+  std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
+  brpc::Server server_;
+  HeterXpuService service_;
+  static bool is_initialized_;
+  DISABLE_COPY_AND_ASSIGN(HeterWrapper);
+  std::vector<std::string> xpu_list_;
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index ac892443de36cf6d37d56da761fb3d60628a5e4a..f69ada080676cddfa4f31c6cbc450b8eca28b3ac 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
   callback();
 }
 
+#ifdef PADDLE_WITH_XPU
+XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
     const platform::CUDAPlace &place, size_t max_memory_size)
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 2212122c03de3416c91fcc46bf510bbc02d4302e..4f7739652822b9047b1798b6bd66261effbe2f49 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
   void ClearCallback(const std::function<void()> &callback) override;
 };
 
+#ifdef PADDLE_WITH_XPU
+class XPUGarbageCollector : public GarbageCollector {
+ public:
+  XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:
diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d00e38784c2c0415a59a33fc24d708c253481c21
--- /dev/null
+++ b/paddle/fluid/framework/generator.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <deque>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/framework/generator.h"
+
+namespace paddle {
+namespace framework {
+
+std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
+
+GeneratorState* Generator::GetState() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_.get();
+}
+
+void Generator::SetState(GeneratorState* state_in) {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  *this->state_ = *state_in;
+}
+
+uint64_t Generator::GetCurrentSeed() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_->current_seed;
+}
+
+uint64_t Generator::Seed() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  uint64_t seed;
+  std::random_device de;
+  seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
+  this->state_->current_seed = seed;
+  std::seed_seq seq({seed});
+  this->state_->cpu_engine.seed(seq);
+
+  return this->state_->current_seed;
+}
+
+void Generator::SetCurrentSeed(uint64_t seed) {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  this->state_->current_seed = uint64_t(seed);
+  std::seed_seq seq({seed});
+  this->state_->cpu_engine.seed(seq);
+}
+
+std::mt19937_64& Generator::GetCPUEngine() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_->cpu_engine;
+}
+
+void Generator::SetCPUEngine(std::mt19937_64 engine) {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  this->state_->cpu_engine = std::mt19937_64(engine);
+}
+
+uint64_t Generator::Random64() {
+  std::lock_guard<std::mutex> lock(this->mutex);
+  return this->state_->cpu_engine();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..17870782ba72a3247de734642962ffec48c0c91e
--- /dev/null
+++ b/paddle/fluid/framework/generator.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <deque>
+#include <iostream>  // temp for debug
+#include <memory>
+#include <mutex>  // NOLINT
+#include <random>
+#include <typeinfo>
+#include <utility>
+
+namespace paddle {
+namespace framework {
+
+struct GeneratorState {
+  int64_t device = -1;
+  uint64_t current_seed = 34342423252;
+  std::mt19937_64 cpu_engine;
+};
+
+struct Generator {
+  Generator() {
+    GeneratorState default_gen_state_cpu;
+    default_gen_state_cpu.device = -1;
+    default_gen_state_cpu.current_seed = 34342423252;
+    std::seed_seq seq({34342423252});
+    default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
+    this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
+  }
+  explicit Generator(GeneratorState state_in)
+      : state_{std::make_shared<GeneratorState>(state_in)} {}
+  Generator(const Generator& other)
+      : Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
+
+  // get random state
+  GeneratorState* GetState();
+  // set random state
+  void SetState(GeneratorState* state_in);
+  // get current seed
+  uint64_t GetCurrentSeed();
+  // random a seed and get
+  uint64_t Seed();
+
+  // set seed
+  void SetCurrentSeed(uint64_t seed);
+  // get cpu engine
+  std::mt19937_64& GetCPUEngine();
+  // set cpu engine
+  void SetCPUEngine(std::mt19937_64 engine);
+
+  uint64_t Random64();
+
+  bool is_init_py = false;
+
+  // CPU Generator singleton
+  static std::shared_ptr<Generator> GetInstance() {
+    if (NULL == gen_instance_) {
+      gen_instance_.reset(new paddle::framework::Generator());
+    }
+    return gen_instance_;
+  }
+
+  static std::shared_ptr<Generator> GetInstanceX() {
+    if (NULL == gen_instance_) {
+      gen_instance_.reset(new paddle::framework::Generator());
+    }
+    gen_instance_->is_init_py = true;
+    return gen_instance_;
+  }
+
+ private:
+  static std::shared_ptr<Generator> gen_instance_;
+  std::shared_ptr<GeneratorState> state_;
+  mutable std::mutex mutex;
+
+  Generator(const Generator& other, const std::lock_guard<std::mutex>&)
+      : state_(std::make_shared<GeneratorState>(*(other.state_))) {}
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..8662e460aa340eab834ecb1721fc4708c7220b29
--- /dev/null
+++ b/paddle/fluid/framework/heter_service.h
@@ -0,0 +1,368 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <thread>         // NOLINT
+#include <unordered_map>  // NOLINT
+#include <unordered_set>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/heter_service.pb.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_PSLIB
+#include "brpc/channel.h"
+#include "brpc/controller.h"
+#include "brpc/server.h"
+
+namespace paddle {
+namespace framework {
+
+typedef std::function<int(const HeterRequest*, HeterResponse*)>
+    HeterServiceHandler;
+class DataFeed;
+
+class HeterXpuService : public HeterService {
+ public:
+  HeterXpuService() {}
+  virtual ~HeterXpuService() {}
+  void service(::google::protobuf::RpcController* controller,
+               const HeterRequest* request, HeterResponse* response,
+               ::google::protobuf::Closure* done) {
+    brpc::ClosureGuard done_guard(done);
+    int ret = 0;
+    int cmd = request->cmd();
+    auto itr = handler_map_.find(cmd);
+    if (itr == handler_map_.end()) {
+    } else {
+      ret = itr->second(request, response);
+    }
+    // response->set_err_code(0);
+    // response->set_err_msg("");
+    if (ret != 0) {
+      // response->set_err_code(-1);
+      // response->set_err_msg("xpu service error");
+    }
+  }
+
+  void RegisterServiceHandler(int cmd, HeterServiceHandler func) {
+    VLOG(0) << "register heter service";
+    handler_map_[cmd] = func;
+  }
+
+ private:
+  std::unordered_map<int, HeterServiceHandler> handler_map_;
+};
+
+enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE };
+
+class HeterTask {
+ public:
+  void Update() {
+    if (state_ == PULL_SPARSE) {
+      state_ = OP_RUN;
+    } else if (state_ == OP_RUN) {
+      state_ = XPU;
+      // state_ = PUSH_GRAD;
+      // state_ = PUSH_GRAD;
+    } else if (state_ == XPU) {
+      state_ = OP_RUN_END;
+    } else if (state_ == OP_RUN_END) {
+      state_ = PUSH_GRAD;
+    } else if (state_ == PUSH_GRAD) {
+      state_ = DONE;
+    }
+  }
+  void Reset() {
+    total_time = 0;
+    read_time = 0;
+    pack_time = 0;
+    pull_sparse_local_time = 0;
+    op_all_time = 0;
+    xpu_op_time = 0;
+    xpu_wait_time = 0;
+    cpu_op_time = 0;
+    collect_label_time = 0;
+    fill_sparse_time = 0;
+    push_sparse_time = 0;
+  }
+  void Show() {
+    std::cout << "features size " << features_.size() << std::endl;
+    for (size_t i = 0; i < features_.size(); ++i) {
+      std::cout << "features[" << i << "] size " << features_[i].size()
+                << std::endl;
+    }
+  }
+  void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch,
+                const ProgramDesc& program);
+
+  Scope* scope_{nullptr};
+  int taskid_;
+  int cur_batch_;
+  HeterTaskState state_;
+  // cache
+  std::map<uint64_t, std::vector<uint64_t>> features_;
+  std::map<uint64_t, std::vector<float>> feature_labels_;
+  std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
+  std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
+  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
+  double total_time{0};
+  double read_time{0};
+  double pack_time{0};
+  double pull_sparse_local_time{0};
+  double op_all_time{0};
+  double xpu_op_time{0};
+  double xpu_wait_time{0};
+  double cpu_op_time{0};
+  double collect_label_time{0};
+  double fill_sparse_time{0};
+  double push_sparse_time{0};
+};
+
+template <class T>
+class HeterObjectPool {
+ public:
+  HeterObjectPool() {}
+  virtual ~HeterObjectPool(){};
+  std::shared_ptr<T> Get() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (pool_.empty()) {
+      num_ += 1;
+#ifdef PADDLE_WITH_CUDA
+      VLOG(0) << "pool construct size: " << num_;
+#endif
+      return std::make_shared<T>();
+    } else {
+      auto ret = pool_.back();
+      pool_.pop_back();
+      return ret;
+    }
+  }
+  void Push(std::shared_ptr<T> data) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    pool_.push_back(std::move(data));
+  }
+  int Size() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return pool_.size();
+  }
+  std::shared_ptr<T>& GetElement(int i) { return pool_[i]; }
+
+ private:
+  std::vector<std::shared_ptr<T>> pool_;
+  std::mutex mutex_;
+  int num_{0};
+};
+
+struct BthreadMutextGuard {
+  BthreadMutextGuard(bthread_mutex_t* rho) {
+    mutex_ = rho;
+    bthread_mutex_lock(mutex_);
+  }
+  ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); }
+  bthread_mutex_t* mutex_;
+};
+
+template <class T>
+class BtObjectPool {
+ public:
+  BtObjectPool() {
+    bthread_mutex_init(&mutex_, NULL);
+    bthread_cond_init(&cond_, NULL);
+  }
+
+  virtual ~BtObjectPool() {
+    bthread_cond_destroy(&cond_);
+    bthread_mutex_destroy(&mutex_);
+  };
+
+  std::shared_ptr<T> Get() {
+    BthreadMutextGuard guard(&mutex_);
+    while (pool_.empty()) {
+      bthread_cond_wait(&cond_, &mutex_);
+    }
+    auto ret = pool_.back();
+    pool_.pop_back();
+    return ret;
+  }
+
+  void Push(std::shared_ptr<T> data) {
+    BthreadMutextGuard guard(&mutex_);
+    pool_.push_back(std::move(data));
+    bthread_cond_signal(&cond_);
+  }
+
+  int Size() { return pool_.size(); }
+
+  std::shared_ptr<T>& GetElement(int i) { return pool_[i]; }
+
+ private:
+  std::vector<std::shared_ptr<T>> pool_;
+  bthread_mutex_t mutex_;
+  bthread_cond_t cond_;
+  int num_{0};
+};
+
+template <class K, class T>
+struct HeterNode {
+  K key;
+  T value;
+  HeterNode* prev;
+  HeterNode* next;
+};
+
+template <class K, class T>
+class HeterList {
+ public:
+  HeterList() : head_(new HeterNode<K, T>), tail_(new HeterNode<K, T>) {
+    head_->prev = NULL;
+    head_->next = tail_;
+    tail_->prev = head_;
+    tail_->next = NULL;
+    size = 0;
+    cap_ = 1e9;
+  }
+
+  ~HeterList() {
+    delete head_;
+    delete tail_;
+  }
+
+  void SetCap(int num) { cap_ = num; }
+
+  bool TryPut(K& key, T& value) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond_.wait(lock, [this] { return size < cap_; });
+    if (task_map_.find(key) != task_map_.end()) {
+      // std::cout << "try put key=" << key << " false" << std::endl;
+      task_map_.erase(key);
+      return false;
+    } else {
+      HeterNode<K, T>* node = new HeterNode<K, T>;
+      node->key = key;
+      node->value = value;
+      map_[node->key] = node;
+      attach(node);
+      // std::cout << "try put key=" << key << " true" << std::endl;
+      return true;
+    }
+  }
+
+  bool Put(K& key, T& value) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond_.wait(lock, [this] { return size < cap_; });
+    HeterNode<K, T>* node = new HeterNode<K, T>;
+    // std::cout << "put key=" << key << " true" << std::endl;
+    node->key = key;
+    node->value = value;
+    map_[node->key] = node;
+    attach(node);
+    return true;
+  }
+
+  T TryGet(const K& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto iter = map_.find(key);
+    if (iter != map_.end()) {
+      // std::cout << "try get key=" << key << " true" << std::endl;
+      HeterNode<K, T>* node = iter->second;
+      detach(node);
+      cond_.notify_one();
+      T ret = std::move(node->value);
+      map_.erase(key);
+      delete node;
+      return ret;
+    }
+    task_map_.insert(key);
+    // std::cout << "try get key=" << key << " false" << std::endl;
+    return nullptr;
+  }
+
+  T Get(const K& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto iter = map_.find(key);
+    if (iter != map_.end()) {
+      // std::cout << "get key=" << key << " true" << std::endl;
+      HeterNode<K, T>* node = iter->second;
+      detach(node);
+      cond_.notify_one();
+      T ret = std::move(node->value);
+      map_.erase(key);
+      delete node;
+      return ret;
+    }
+    // std::cout << "get key=" << key << " false" << std::endl;
+    return nullptr;
+  }
+
+  T Get() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    HeterNode<K, T>* node = head_->next;
+    if (node == tail_) {
+      // std::cout << "get2 false" << std::endl;
+      return nullptr;
+    } else {
+      detach(node);
+      cond_.notify_one();
+      T ret = std::move(node->value);
+      map_.erase(node->key);
+      // std::cout << "get2 key=" << node->key << " true" << std::endl;
+      delete node;
+      return ret;
+    }
+  }
+
+  bool Empty() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return head_->next == tail_;
+  }
+
+  int Size() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return size;
+  }
+
+ private:
+  void detach(HeterNode<K, T>* node) {
+    node->prev->next = node->next;
+    node->next->prev = node->prev;
+    size--;
+  }
+
+  void attach(HeterNode<K, T>* node) {
+    node->prev = head_;
+    node->next = head_->next;
+    head_->next->prev = node;
+    head_->next = node;
+    size++;
+  }
+
+ private:
+  HeterNode<K, T>* head_;
+  HeterNode<K, T>* tail_;
+  std::unordered_map<K, HeterNode<K, T>*> map_;
+  std::unordered_set<K> task_map_;
+  std::mutex mutex_;
+  std::condition_variable cond_;
+  int cap_;
+  int size;
+};
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/heter_service.proto b/paddle/fluid/framework/heter_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c8c9ec80b3fa85e503eb5c243b8933af914b1017
--- /dev/null
+++ b/paddle/fluid/framework/heter_service.proto
@@ -0,0 +1,69 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle.framework;
+option cc_generic_services = true;
+
+// It can be: LoDTensor、SelectedRows or NCCL_ID
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+  NCCL_ID = 2;
+}
+
+// VariableMessage is serialized paddle variable message.
+// NOTICE(gongwb):don't modify this proto if you are not
+//   not familar with how we serialize in sendrecvop_utils.h
+//   and deserilize it in  variable_response.h.
+message VariableMessage {
+  enum Type {
+    // Pod Types
+    BOOL = 0;
+    INT16 = 1;
+    INT32 = 2;
+    INT64 = 3;
+    FP16 = 4;
+    FP32 = 5;
+    FP64 = 6;
+  }
+
+  message LodData { repeated int64 lod_data = 1; }
+  optional string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  optional VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
+  optional Type data_type = 3;
+  repeated int64 dims = 4;
+
+  // lod details:
+  optional int64 lod_level = 5;
+  repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  optional int64 slr_height = 7;
+  // tensor data
+  optional bytes data = 8;
+}
+message HeterRequest {
+  required int32 cmd = 1;
+  optional int32 cur_batch = 2;
+  repeated VariableMessage vars = 3;
+};
+
+message HeterResponse {
+  // optional VariableMessage vars = 1;
+  repeated VariableMessage vars = 1;
+};
+
+service HeterService { rpc service(HeterRequest) returns (HeterResponse); };
diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..83838f4df67d0bcbd9fb8ec8fb6762641287d2c4
--- /dev/null
+++ b/paddle/fluid/framework/hetercpu_worker.cc
@@ -0,0 +1,1166 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include "paddle/fluid/framework/fleet/heter_wrapper.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+
+#ifdef PADDLE_WITH_PSLIB
+
+#if defined _WIN32 || defined __APPLE__
+#else
+#define _LINUX
+#endif
+
+namespace paddle {
+namespace framework {
+
+void HeterTask::PackTask(Scope* thread_scope, int taskid, DataFeed* reader,
+                         int cur_batch, const ProgramDesc& program) {
+  // total_time = 0;
+  // read_time = 0;
+  // pack_time = 0;
+  // pull_sparse_local_time = 0;
+  taskid_ = taskid;
+  auto& block = program.Block(0);
+  if (!scope_) {
+    scope_ = &(thread_scope->NewScope());
+    for (auto& var : block.AllVars()) {
+      if (!var->Persistable()) {
+        auto* ptr = scope_->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+      }
+    }
+  }
+  state_ = PULL_SPARSE;
+  cur_batch_ = cur_batch;
+  auto& use_slots = reader->GetUseSlotAlias();
+  for (size_t i = 0; i < use_slots.size(); ++i) {
+    Variable* thread_var = thread_scope->FindVar(use_slots[i]);
+    LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
+    Variable* task_var = scope_->FindVar(use_slots[i]);
+    LoDTensor* task_tensor = task_var->GetMutable<LoDTensor>();
+    TensorCopy(*thread_tensor, platform::CPUPlace(), task_tensor);
+    auto& tensor_lod = thread_tensor->lod()[0];
+    LoD thread_lod{tensor_lod};
+    task_tensor->set_lod(thread_lod);
+  }
+}
+
+void HeterCpuWorker::GetXpuOpIndex() {
+  xpu_begin_op_index_ = trainer_desc_.xpu_start_idx();
+  xpu_end_op_index_ = trainer_desc_.xpu_end_idx();
+  VLOG(0) << "xpu begin: " << xpu_begin_op_index_
+          << " xpu end: " << xpu_end_op_index_;
+  // CHECK(xpu_begin_op_index_ == trainer_desc_.xpu_start_idx());
+  // CHECK(xpu_end_op_index_ == trainer_desc_.xpu_end_idx());
+  // CHECK(trainer_desc_.op_run_start_idx() == 0);
+  // CHECK(trainer_desc_.op_run_end_idx() == xpu_begin_op_index_ - 1);
+  // CHECK(trainer_desc_.op_run_end_start_idx() == xpu_end_op_index_ + 1);
+  // CHECK(trainer_desc_.op_run_end_end_idx() == ops_.size() - 1);
+}
+
+void HeterCpuWorker::Schedule(int taskid) {
+  VLOG(3) << "schedule " << taskid;
+  auto task = wait_queue_.TryGet(taskid);
+  if (task) {
+    run_queue_.Put(task->taskid_, task);
+  }
+}
+
+void HeterCpuWorker::JumpContext(std::shared_ptr<HeterTask> task) {
+  VLOG(3) << "jump context " << task->taskid_;
+  if (!(wait_queue_.TryPut(task->taskid_, task))) {
+    run_queue_.Put(task->taskid_, task);
+  }
+}
+
+void HeterCpuWorker::Initialize(const TrainerDesc& desc) {
+  param_ = desc.downpour_param();
+  mpi_rank_ = desc.mpi_rank();
+  trainer_desc_ = desc;
+  for (int i = 0; i < param_.sparse_table_size(); ++i) {
+    uint64_t table_id =
+        static_cast<uint64_t>(param_.sparse_table(i).table_id());
+    TableParameter table = param_.sparse_table(i);
+    sparse_key_names_[table_id].resize(table.sparse_key_name_size());
+    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
+      sparse_key_names_[table_id][j] = table.sparse_key_name(j);
+    }
+    sparse_value_names_[table_id].resize(table.sparse_value_name_size());
+    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
+      sparse_value_names_[table_id][j] = table.sparse_value_name(j);
+    }
+    sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
+    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
+      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
+    }
+    label_var_name_[table_id] = table.label_var_name();
+    sparse_push_keys_[table_id] = std::vector<uint64_t>();
+  }
+
+  for (int i = 0; i < param_.dense_table_size(); ++i) {
+    uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
+    auto table = param_.dense_table(i);
+    dense_value_names_[table_id].resize(table.dense_value_name_size());
+    for (int j = 0; j < table.dense_value_name_size(); ++j) {
+      dense_value_names_[table_id][j] = table.dense_value_name(j);
+    }
+    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
+    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
+      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
+    }
+  }
+
+  skip_ops_.resize(param_.skip_ops_size());
+  for (int i = 0; i < param_.skip_ops_size(); ++i) {
+    skip_ops_[i] = param_.skip_ops(i);
+  }
+  for (int i = 0; i < param_.stat_var_names_size(); ++i) {
+    stat_var_name_map_[param_.stat_var_names(i)] = 1;
+  }
+
+  need_to_push_sparse_ = param_.push_sparse();
+  need_to_push_dense_ = param_.push_dense();
+
+  fleet_ptr_ = FleetWrapper::GetInstance();
+  heter_ptr_ = HeterWrapper::GetInstance();
+  fetch_config_ = desc.fetch_config();
+  use_cvm_ = desc.use_cvm();
+  // for sparse value accessor, embedding only
+  no_cvm_ = desc.no_cvm();
+  scale_datanorm_ = desc.scale_datanorm();
+  dump_slot_ = desc.dump_slot();
+  dump_fields_.resize(desc.dump_fields_size());
+  for (int i = 0; i < desc.dump_fields_size(); ++i) {
+    dump_fields_[i] = desc.dump_fields(i);
+  }
+  adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
+  need_dump_param_ = false;
+  dump_param_.resize(desc.dump_param_size());
+  for (int i = 0; i < desc.dump_param_size(); ++i) {
+    dump_param_[i] = desc.dump_param(i);
+  }
+  if (desc.dump_param_size() != 0) {
+    need_dump_param_ = true;
+  }
+  for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
+    check_nan_var_names_.push_back(desc.check_nan_var_names(i));
+  }
+  copy_table_config_ = desc.copy_table_config();
+  for (int i = 0; i < copy_table_config_.src_sparse_tables_size(); ++i) {
+    uint64_t src_table = copy_table_config_.src_sparse_tables(i);
+    uint64_t dest_table = copy_table_config_.dest_sparse_tables(i);
+    VLOG(3) << "copy_sparse_tables_ push back " << src_table << "->"
+            << dest_table;
+    copy_sparse_tables_.push_back(std::make_pair(src_table, dest_table));
+  }
+  for (int i = 0; i < copy_table_config_.src_dense_tables_size(); ++i) {
+    uint64_t src_table = copy_table_config_.src_dense_tables(i);
+    uint64_t dest_table = copy_table_config_.dest_dense_tables(i);
+    VLOG(3) << "copy_dense_tables_ push back " << src_table << "->"
+            << dest_table;
+    copy_dense_tables_.push_back(std::make_pair(src_table, dest_table));
+  }
+  for (auto& m : copy_table_config_.table_denpendency_map()) {
+    if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) {
+      // currently only support one dependency
+      for (auto& value : m.values()) {
+        table_dependency_[m.key()] = value;
+      }
+    }
+  }
+}
+
+void HeterCpuWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
+  writer_.Reset(queue);
+}
+
+void HeterCpuWorker::SetNeedDump(bool need_dump_field) {
+  need_dump_field_ = need_dump_field;
+}
+
+// template <typename T>
+// std::string PrintLodTensorType(LoDTensor* tensor,
+//                                int64_t start, int64_t end) {
+//   auto count = tensor->numel();
+//   if (start < 0 || end > count) {
+//     VLOG(3) << "access violation";
+//     return "access violation";
+//   }
+//   std::ostringstream os;
+//   for (int64_t i = start; i < end; i++) {
+//     os << ":" << tensor->data<T>()[i];
+//   }
+//   return os.str();
+// }
+//
+// std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
+//                                   int64_t end) {
+//   auto count = tensor->numel();
+//   if (start < 0 || end > count) {
+//     VLOG(3) << "access violation";
+//     return "access violation";
+//   }
+//   std::ostringstream os;
+//   for (int64_t i = start; i < end; i++) {
+//     os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
+//   }
+//   return os.str();
+// }
+//
+// std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) {
+//   std::string out_val;
+//   if (tensor->type() == proto::VarType::FP32) {
+//     out_val = PrintLodTensorType<float>(tensor, start, end);
+//   } else if (tensor->type() == proto::VarType::INT64) {
+//     out_val = PrintLodTensorIntType(tensor, start, end);
+//   } else if (tensor->type() == proto::VarType::FP64) {
+//     out_val = PrintLodTensorType<double>(tensor, start, end);
+//   } else {
+//     out_val = "unsupported type";
+//   }
+//   return out_val;
+// }
+//
+// std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
+//   auto& dims = tensor->dims();
+//   if (tensor->lod().size() != 0) {
+//     auto& lod = tensor->lod()[0];
+//     return {lod[index] * dims[1], lod[index + 1] * dims[1]};
+//   } else {
+//     return {index * dims[1], (index + 1) * dims[1]};
+//   }
+// }
+//
+// bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) {
+//   auto& dims = tensor->dims();
+//   if (dims.size() != 2) return false;
+//   if (tensor->lod().size() != 0) {
+//     auto& lod = tensor->lod()[0];
+//     if (lod.size() != batch_size + 1) {
+//       return false;
+//     }
+//   } else {
+//     if (dims[0] != static_cast<int>(batch_size)) {
+//       return false;
+//     }
+//   }
+//   return true;
+// }
+
+void HeterCpuWorker::DumpParam() {
+  //  std::string os;
+  //  for (auto& param : dump_param_) {
+  //    os.clear();
+  //    os = param;
+  //    Variable* var = thread_scope_->FindVar(param);
+  //    if (var == nullptr) {
+  //      continue;
+  //    }
+  //    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+  //    int64_t len = tensor->numel();
+  //    os += PrintLodTensor(tensor, 0, len);
+  //    writer_ << os;
+  //  }
+}
+
+void HeterCpuWorker::CollectLabelInfo(std::shared_ptr<HeterTask> task,
+                                      size_t table_idx) {
+  if (no_cvm_) {
+    return;
+  }
+  uint64_t table_id = static_cast<uint64_t>(
+      param_.program_config(0).pull_sparse_table_id(table_idx));
+
+  TableParameter table;
+  for (auto i : param_.sparse_table()) {
+    if (i.table_id() == table_id) {
+      table = i;
+      break;
+    }
+  }
+  auto& feature = (task->features_)[table_id];
+  auto& feature_label = (task->feature_labels_)[table_id];
+  Scope* scope = task->scope_;
+  feature_label.resize(feature.size());
+  Variable* var = scope->FindVar(label_var_name_[table_id]);
+  LoDTensor* tensor = var->GetMutable<LoDTensor>();
+  int64_t* label_ptr = tensor->data<int64_t>();
+
+  size_t global_index = 0;
+  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
+    VLOG(3) << "sparse_key_names_[" << i
+            << "]: " << sparse_key_names_[table_id][i];
+    Variable* fea_var = scope->FindVar(sparse_key_names_[table_id][i]);
+    if (fea_var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
+    CHECK(tensor != nullptr) << "tensor of var "
+                             << sparse_key_names_[table_id][i] << " is null";
+
+    // skip slots which do not have embedding
+    Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]);
+    if (emb_var == nullptr) {
+      continue;
+    }
+    int64_t* ids = tensor->data<int64_t>();
+    size_t fea_idx = 0;
+    // tensor->lod()[0].size() == batch_size + 1
+    for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
+      for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
+        // should be skipped feasign defined in protobuf
+        if (ids[fea_idx] == 0u) {
+          continue;
+        }
+        feature_label[global_index++] =
+            static_cast<float>(label_ptr[lod_idx - 1]);
+      }
+    }
+  }
+  CHECK(global_index == feature.size())
+      << "expect fea info size:" << feature.size() << " real:" << global_index;
+}
+
+void HeterCpuWorker::FillSparseValue(std::shared_ptr<HeterTask> task,
+                                     size_t table_idx) {
+  uint64_t table_id = static_cast<uint64_t>(
+      param_.program_config(0).pull_sparse_table_id(table_idx));
+
+  TableParameter table;
+  for (auto i : param_.sparse_table()) {
+    if (i.table_id() == table_id) {
+      table = i;
+      break;
+    }
+  }
+
+  auto& fea_value = (task->feature_values_)[table_id];
+  Scope* scope = task->scope_;
+  auto fea_idx = 0u;
+
+  std::vector<float> init_value(table.fea_dim());
+  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
+    std::string slot_name = sparse_key_names_[table_id][i];
+    std::string emb_slot_name = sparse_value_names_[table_id][i];
+    Variable* var = scope->FindVar(slot_name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null";
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    Variable* var_emb = scope->FindVar(emb_slot_name);
+    if (var_emb == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
+    float* ptr =
+        tensor_emb->mutable_data<float>({len, table.emb_dim()}, place_);
+    // memset(ptr, 0, sizeof(float) * len * table.emb_dim());
+    auto& tensor_lod = tensor->lod()[0];
+    LoD data_lod{tensor_lod};
+    tensor_emb->set_lod(data_lod);
+
+    bool is_nid = (adjust_ins_weight_config_.need_adjust() &&
+                   adjust_ins_weight_config_.nid_slot() == emb_slot_name);
+    if (is_nid) {
+      nid_show_.clear();
+    }
+    int nid_ins_index = 0;
+
+    for (int index = 0; index < len; ++index) {
+      if (use_cvm_ || no_cvm_) {
+        if (ids[index] == 0u) {
+          memcpy(ptr + table.emb_dim() * index, init_value.data(),
+                 sizeof(float) * table.emb_dim());
+          if (is_nid) {
+            nid_show_.push_back(-1);
+            ++nid_ins_index;
+          }
+          continue;
+        }
+        memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(),
+               sizeof(float) * table.emb_dim());
+        if (is_nid &&
+            static_cast<size_t>(index) == tensor->lod()[0][nid_ins_index]) {
+          nid_show_.push_back(fea_value[fea_idx][0]);
+          ++nid_ins_index;
+        }
+        fea_idx++;
+      } else {
+        if (ids[index] == 0u) {
+          memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
+                 sizeof(float) * table.emb_dim());
+          if (is_nid) {
+            nid_show_.push_back(-1);
+            ++nid_ins_index;
+          }
+          continue;
+        }
+        memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2,
+               sizeof(float) * table.emb_dim());
+        if (is_nid &&
+            static_cast<size_t>(index) == tensor->lod()[0][nid_ins_index]) {
+          nid_show_.push_back(fea_value[fea_idx][0]);
+          ++nid_ins_index;
+        }
+        fea_idx++;
+      }
+    }
+  }
+}
+
+void HeterCpuWorker::AdjustInsWeight(std::shared_ptr<HeterTask> task) {
+#ifdef _LINUX
+  // check var and tensor not null
+  Scope* scope = task->scope_;
+  if (!adjust_ins_weight_config_.need_adjust()) {
+    VLOG(0) << "need_adjust=false, skip adjust ins weight";
+    return;
+  }
+  Variable* nid_var = scope->FindVar(adjust_ins_weight_config_.nid_slot());
+  if (nid_var == nullptr) {
+    VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot()
+            << " is nullptr, skip adjust ins weight";
+    return;
+  }
+  LoDTensor* nid_tensor = nid_var->GetMutable<LoDTensor>();
+  if (nid_tensor == nullptr) {
+    VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot()
+            << " is nullptr, skip adjust ins weight";
+    return;
+  }
+  Variable* ins_weight_var =
+      scope->FindVar(adjust_ins_weight_config_.ins_weight_slot());
+  if (ins_weight_var == nullptr) {
+    VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot()
+            << " is nullptr, skip adjust ins weight";
+    return;
+  }
+  LoDTensor* ins_weight_tensor = ins_weight_var->GetMutable<LoDTensor>();
+  if (ins_weight_tensor == nullptr) {
+    VLOG(0) << "tensor of ins weight tensor "
+            << adjust_ins_weight_config_.ins_weight_slot()
+            << " is nullptr, skip adjust ins weight";
+    return;
+  }
+
+  float* ins_weights = ins_weight_tensor->data<float>();
+  size_t len = ins_weight_tensor->numel();  // len = batch size
+  // here we assume nid_show slot only has one feasign in each instance
+  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
+                                 << "nid_show size, " << len << " vs "
+                                 << nid_show_.size();
+  float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
+  float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
+  int64_t nid_adjw_num = 0;
+  double nid_adjw_weight = 0.0;
+  size_t ins_index = 0;
+  for (size_t i = 0; i < len; ++i) {
+    float nid_show = nid_show_[i];
+    VLOG(3) << "nid_show " << nid_show;
+    if (nid_show < 0) {
+      VLOG(3) << "nid_show < 0, continue";
+      continue;
+    }
+    float ins_weight = 1.0;
+    if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
+      ins_weight = log(M_E +
+                       (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
+                           nid_adjw_ratio);
+      // count nid adjw insnum and weight
+      ++nid_adjw_num;
+      nid_adjw_weight += ins_weight;
+      // choose large ins weight
+      VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin "
+              << ins_weights[ins_index];
+      if (ins_weight > ins_weights[ins_index]) {
+        VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight;
+        ins_weights[ins_index] = ins_weight;
+      }
+      ++ins_index;
+    }
+  }
+  VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num
+          << ", avg_adjw_weight: " << nid_adjw_weight;
+#endif
+}
+
+void HeterCpuWorker::CopySparseTable() {
+  for (size_t i = 0; i < copy_sparse_tables_.size(); ++i) {
+    int64_t src_table = copy_sparse_tables_[i].first;
+    int64_t dest_table = copy_sparse_tables_[i].second;
+    int32_t feanum = 0;
+    if (src_table == dest_table) {
+      continue;
+    } else if (!copy_table_config_.sparse_copy_by_feasign()) {
+      if (feasign_set_.find(src_table) == feasign_set_.end()) {
+        continue;
+      } else if (feasign_set_[src_table].size() == 0) {
+        continue;
+      }
+      feanum = fleet_ptr_->CopyTable(src_table, dest_table);
+    } else {
+      std::vector<uint64_t> fea_vec(feasign_set_[src_table].begin(),
+                                    feasign_set_[src_table].end());
+      feanum = fleet_ptr_->CopyTableByFeasign(src_table, dest_table, fea_vec);
+      fea_vec.clear();
+      std::vector<uint64_t>().swap(fea_vec);
+    }
+    VLOG(3) << "copy feasign from table " << src_table << " to table "
+            << dest_table << ", feasign num=" << feanum;
+    feasign_set_[src_table].clear();
+    std::unordered_set<uint64_t>().swap(feasign_set_[src_table]);
+  }
+  feasign_set_.clear();
+}
+
+void HeterCpuWorker::CopyDenseTable() {
+  if (thread_id_ != 0) {
+    return;
+  }
+  thread_local std::vector<std::future<int32_t>> pull_dense_status;
+  for (size_t i = 0; i < copy_dense_tables_.size(); ++i) {
+    uint64_t src_table = copy_dense_tables_[i].first;
+    uint64_t dest_table = copy_dense_tables_[i].second;
+    if (src_table == dest_table) {
+      continue;
+    }
+    int32_t dim = fleet_ptr_->CopyTable(src_table, dest_table);
+    VLOG(3) << "copy param from table " << src_table << " to table "
+            << dest_table << ", dim=" << dim;
+    if (copy_table_config_.dense_pull_after_copy()) {
+      VLOG(3) << "dense pull after copy, table=" << dest_table;
+      pull_dense_status.resize(0);
+      // fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
+      //                                dense_value_names_[dest_table],
+      //                                &pull_dense_status);
+      for (auto& t : pull_dense_status) {
+        t.wait();
+        auto status = t.get();
+        if (status != 0) {
+          LOG(WARNING) << "pull dense after copy table failed,"
+                       << " table=" << dest_table;
+        }
+      }
+    }
+  }
+}
+
+void HeterCpuWorker::CopyDenseVars() {
+  if (thread_id_ != 0) {
+    return;
+  }
+  for (int i = 0; i < copy_table_config_.src_var_list_size(); ++i) {
+    auto& src_var_name = copy_table_config_.src_var_list(i);
+    auto& dest_var_name = copy_table_config_.dest_var_list(i);
+    if (src_var_name == dest_var_name) {
+      continue;
+    }
+    VLOG(3) << "copy dense var from " << src_var_name << " to "
+            << dest_var_name;
+    Variable* src_var = thread_scope_->FindVar(src_var_name);
+    CHECK(src_var != nullptr) << src_var_name << " not found";  // NOLINT
+    LoDTensor* src_tensor = src_var->GetMutable<LoDTensor>();
+    CHECK(src_tensor != nullptr) << src_var_name
+                                 << " tensor is null";  // NOLINT
+    float* src_data = src_tensor->data<float>();
+
+    Variable* dest_var = thread_scope_->FindVar(dest_var_name);
+    CHECK(dest_var != nullptr) << dest_var_name << " not found";  // NOLINT
+    LoDTensor* dest_tensor = dest_var->GetMutable<LoDTensor>();
+    CHECK(dest_tensor != nullptr) << dest_var_name
+                                  << " tensor is null";  // NOLINT
+    float* dest_data = dest_tensor->data<float>();
+
+    CHECK(src_tensor->numel() == dest_tensor->numel())
+        << "tensor numel not equal," << src_tensor->numel() << " vs "
+        << dest_tensor->numel();
+    for (int i = 0; i < src_tensor->numel(); i++) {
+      dest_data[i] = src_data[i];
+    }
+  }
+}
+
+void HeterCpuWorker::TrainFilesWithProfiler() {
+  VLOG(3) << "Begin to train files with profiler";
+  platform::SetNumThreads(1);
+  device_reader_->Start();
+
+  std::vector<double> op_total_time;
+  std::vector<std::string> op_name;
+  for (auto& op : ops_) {
+    bool need_skip = false;
+    for (auto t = 0u; t < skip_ops_.size(); ++t) {
+      if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+        need_skip = true;
+        break;
+      }
+    }
+    if (!need_skip) {
+      op_name.push_back(op->Type());
+    }
+  }
+
+  VLOG(3) << "op name size: " << op_name.size();
+  op_total_time.resize(op_name.size());
+  for (size_t i = 0; i < op_total_time.size(); ++i) {
+    op_total_time[i] = 0.0;
+  }
+  platform::Timer timeline;
+  double total_time = 0.0;
+  double read_time = 0.0;
+  double pack_time = 0.0;
+  double pull_sparse_local_time = 0.0;
+  double op_all_time = 0;
+  double xpu_op_time = 0;
+  double xpu_wait_time = 0;
+  double cpu_op_time = 0;
+  double collect_label_time = 0;
+  double fill_sparse_time = 0;
+  double push_sparse_time = 0;
+
+  int batch_cnt = 0;
+  int done_cnt = 0;
+  int cur_batch;
+  uint64_t total_inst = 0;
+  wait_queue_.SetCap(1);
+  while (1) {
+    std::shared_ptr<HeterTask> task;
+    task = run_queue_.Get();
+    if (!task) {
+      double tmp_read_time;
+      timeline.Start();
+      cur_batch = device_reader_->Next();
+      timeline.Pause();
+      tmp_read_time = timeline.ElapsedSec();
+      if (cur_batch <= 0) {
+        if (batch_cnt == done_cnt) {
+          break;
+        } else {
+          continue;
+        }
+      }
+      batch_cnt += 1;
+      int taskid = batch_cnt * worker_num_ + thread_id_;
+      timeline.Start();
+      task = object_pool_.Get();
+      task->Reset();
+      task->PackTask(thread_scope_, taskid, device_reader_, cur_batch,
+                     program_);
+      timeline.Pause();
+      task->read_time = tmp_read_time;
+      task->pack_time = timeline.ElapsedSec();
+      task->total_time = tmp_read_time + task->pack_time;
+    }
+    for (;;) {
+      // pull sparse here
+      if (task->state_ == PULL_SPARSE) {
+        timeline.Start();
+        for (int i = 0;
+             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
+          uint64_t tid = static_cast<uint64_t>(
+              param_.program_config(0).pull_sparse_table_id(i));
+          TableParameter table;
+          for (auto j : param_.sparse_table()) {
+            if (j.table_id() == tid) {
+              table = j;
+              break;
+            }
+          }
+          fleet_ptr_->HeterPullSparseVars(
+              thread_id_, task, tid, sparse_key_names_[tid], table.fea_dim(),
+              sparse_value_names_[tid]);
+        }
+        task->Update();
+        // JumpContext(task);
+        timeline.Pause();
+        task->pull_sparse_local_time += timeline.ElapsedSec();
+        task->total_time += timeline.ElapsedSec();
+      } else if (task->state_ == OP_RUN) {
+        // total_time += task->total_time;
+        // read_time += task->read_time;
+        // pack_time += task->pack_time;
+        // pull_sparse_local_time += task->pull_sparse_local_time;
+        for (int i = 0;
+             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
+          uint64_t tid = static_cast<uint64_t>(
+              param_.program_config(0).pull_sparse_table_id(i));
+          timeline.Start();
+          CollectLabelInfo(task, i);
+          timeline.Pause();
+          task->collect_label_time += timeline.ElapsedSec();
+          task->total_time += timeline.ElapsedSec();
+          timeline.Start();
+          FillSparseValue(task, i);
+          timeline.Pause();
+          task->fill_sparse_time += timeline.ElapsedSec();
+          task->total_time += timeline.ElapsedSec();
+
+          auto nid_iter = std::find(sparse_value_names_[tid].begin(),
+                                    sparse_value_names_[tid].end(),
+                                    adjust_ins_weight_config_.nid_slot());
+          if (nid_iter != sparse_value_names_[tid].end()) {
+            AdjustInsWeight(task);
+          }
+        }
+
+        VLOG(3) << "fill sparse value for all sparse table done.";
+        // do computation here
+        // int run_op_idx = 0;
+        timeline.Start();
+        for (int i = 0; i < xpu_begin_op_index_; ++i) {
+          auto& op = ops_[i];
+          bool need_skip = false;
+          for (auto t = 0u; t < skip_ops_.size(); ++t) {
+            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+              need_skip = true;
+              break;
+            }
+          }
+          if (!need_skip) {
+            // timeline.Start();
+            op->Run(*(task->scope_), place_);
+            // timeline.Pause();
+            // op_total_time[run_op_idx++] += timeline.ElapsedSec();
+            // total_time += timeline.ElapsedSec();
+          }
+        }
+        task->Update();
+        timeline.Pause();
+        task->cpu_op_time += timeline.ElapsedSec();
+        task->total_time += timeline.ElapsedSec();
+      } else if (task->state_ == XPU) {
+        timeline.Start();
+        VLOG(3) << "call remote xpu taskid = " << task->taskid_;
+        std::vector<std::string> send_var_list;
+        for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
+          send_var_list.push_back(trainer_desc_.xpu_recv_list(i));
+        }
+        heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list);
+        timeline.Pause();
+        task->xpu_op_time += timeline.ElapsedSec();
+        task->total_time += timeline.ElapsedSec();
+        task->Update();
+        timeline.Start();
+        JumpContext(task);
+        timeline.Pause();
+        task->xpu_wait_time += timeline.ElapsedSec();
+        task->total_time += timeline.ElapsedSec();
+        break;
+      } else if (task->state_ == OP_RUN_END) {
+        timeline.Start();
+        for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) {
+          auto& op = ops_[i];
+          bool need_skip = false;
+          for (auto t = 0u; t < skip_ops_.size(); ++t) {
+            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+              need_skip = true;
+              break;
+            }
+          }
+          if (!need_skip) {
+            op->Run(*(task->scope_), place_);
+          }
+        }
+        // check inf and nan
+        for (std::string& var_name : check_nan_var_names_) {
+          Variable* var = (task->scope_)->FindVar(var_name);
+          if (var == nullptr) {
+            continue;
+          }
+          LoDTensor* tensor = var->GetMutable<LoDTensor>();
+          if (tensor == nullptr) {
+            continue;
+          }
+        }
+        task->Update();
+        timeline.Pause();
+        task->cpu_op_time += timeline.ElapsedSec();
+        task->total_time += timeline.ElapsedSec();
+      } else if (task->state_ == PUSH_GRAD) {
+        if (need_to_push_sparse_) {
+          // push gradients here
+          for (int i = 0;
+               i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+            uint64_t tid = static_cast<uint64_t>(
+                param_.program_config(0).push_sparse_table_id(i));
+            TableParameter table;
+            for (auto i : param_.sparse_table()) {
+              if (i.table_id() == tid) {
+                table = i;
+                break;
+              }
+            }
+            timeline.Start();
+            fleet_ptr_->HeterPushSparseVars(
+                task, tid, sparse_key_names_[tid], sparse_grad_names_[tid],
+                table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_,
+                no_cvm_);
+            timeline.Pause();
+            task->push_sparse_time += timeline.ElapsedSec();
+            task->total_time += timeline.ElapsedSec();
+          }
+        }
+
+        if (need_to_push_sparse_) {
+          VLOG(3) << "push sparse gradient done.";
+          int32_t tmp_push_sparse_wait_times = -1;
+          static uint32_t push_sparse_wait_times =
+              static_cast<uint32_t>(tmp_push_sparse_wait_times);
+          if (push_sparse_status_.size() >= push_sparse_wait_times) {
+            for (auto& t : push_sparse_status_) {
+              t.wait();
+            }
+            push_sparse_status_.resize(0);
+          }
+
+          if (tmp_push_sparse_wait_times == -1) {
+            push_sparse_status_.resize(0);
+          }
+        }
+
+        // thread_scope_->DropKids();
+        task->Update();
+      } else if (task->state_ == DONE) {
+        PrintFetchVars();
+        ++done_cnt;
+        total_inst += task->cur_batch_;
+        object_pool_.Push(task);
+
+        total_time += task->total_time;
+        read_time += task->read_time;
+        pack_time += task->pack_time;
+        pull_sparse_local_time += task->pull_sparse_local_time;
+        op_all_time += task->op_all_time;
+        xpu_op_time += task->xpu_op_time;
+        xpu_wait_time += task->xpu_wait_time;
+        cpu_op_time += task->cpu_op_time;
+        collect_label_time += task->collect_label_time;
+        fill_sparse_time += task->fill_sparse_time;
+        push_sparse_time += task->push_sparse_time;
+        // ++batch_cnt;
+        if (thread_id_ == 0) {
+          // should be configured here
+          if (done_cnt > 0 && done_cnt % 100 == 0) {
+            // double op_sum_time = 0;
+            // std::unordered_map<std::string, double> op_to_time;
+            // for (size_t i = 0; i < op_total_time.size(); ++i) {
+            //   fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
+            //           op_name[i].c_str(), op_total_time[i] / done_cnt);
+            //   if (op_to_time.find(op_name[i]) == op_to_time.end()) {
+            //     op_to_time[op_name[i]] = 0.0;
+            //   }
+            //   op_to_time[op_name[i]] += op_total_time[i];
+            //   op_sum_time += op_total_time[i];
+            // }
+            // for (auto& i : op_to_time) {
+            //   fprintf(stderr, "op [%s] run total time: [%f]ms\n",
+            //           i.first.c_str(),
+            //           i.second / done_cnt);
+            // }
+            fprintf(stderr, "cpu op run total time: %fs\n",
+                    cpu_op_time / done_cnt);
+            fprintf(stderr, "xpu op run total time: %fs\n",
+                    xpu_op_time / done_cnt);
+            fprintf(stderr, "xpu wait total time: %fs\n",
+                    xpu_wait_time / done_cnt);
+            fprintf(stderr, "pack task time: %fs\n", pack_time / done_cnt);
+            fprintf(stderr, "train total time: %fs\n", total_time / done_cnt);
+            fprintf(stderr, "pull sparse local time: %fs\n",
+                    pull_sparse_local_time / done_cnt);
+            fprintf(stderr, "fill sparse time: %fs\n",
+                    fill_sparse_time / done_cnt);
+            fprintf(stderr, "push sparse time: %fs\n",
+                    push_sparse_time / done_cnt);
+            fprintf(stderr, "collect label time: %fs\n",
+                    collect_label_time / done_cnt);
+            fprintf(stderr, "mean read time: %fs\n", read_time / done_cnt);
+            fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
+            fprintf(stderr, "cpu op run percent: %f\n",
+                    cpu_op_time / total_time * 100);
+            fprintf(stderr, "xpu op run percent: %f\n",
+                    xpu_op_time / total_time * 100);
+            fprintf(stderr, "xpu wait percent: %f\n",
+                    xpu_wait_time / total_time * 100);
+            fprintf(stderr, "pack task percent: %f\n",
+                    pack_time / total_time * 100);
+            fprintf(stderr, "pull sparse local time percent: %f\n",
+                    pull_sparse_local_time / total_time * 100);
+            fprintf(stderr, "collect label time percent: %f\n",
+                    collect_label_time / total_time * 100);
+            fprintf(stderr, "fill sparse time percent: %f\n",
+                    fill_sparse_time / total_time * 100);
+            fprintf(stderr, "push sparse time percent: %f\n",
+                    push_sparse_time / total_time * 100);
+            fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
+          }
+        }
+        break;
+      }
+    }
+  }
+  if (copy_table_config_.need_copy()) {
+    CopySparseTable();
+    CopyDenseTable();
+    CopyDenseVars();
+  }
+}
+
+void HeterCpuWorker::TrainFiles() {
+  VLOG(3) << "Begin to train files";
+  platform::SetNumThreads(1);
+  device_reader_->Start();
+  int batch_cnt = 0;
+  int done_cnt = 0;
+  int cur_batch;
+  wait_queue_.SetCap(1);
+  need_to_push_dense_ = false;
+  while (1) {
+    // if (copy_table_config_.need_copy()) {
+    //   if (copy_table_config_.sparse_copy_by_feasign()) {
+    //     for (size_t i = 0; i < copy_sparse_tables_.size(); ++i) {
+    //       uint64_t tid = copy_sparse_tables_[i].first;
+    //       feasign_set_[tid].insert(sparse_push_keys_[tid].begin(),
+    //                                sparse_push_keys_[tid].end());
+    //     }
+    //   }
+    //   if (batch_cnt % copy_table_config_.batch_num() == 0) {
+    //     CopySparseTable();
+    //     CopyDenseTable();
+    //     CopyDenseVars();
+    //   }
+    // }
+
+    std::shared_ptr<HeterTask> task;
+
+    task = run_queue_.Get();
+    if (!task) {
+      cur_batch = device_reader_->Next();
+      if (cur_batch <= 0) {
+        if (batch_cnt == done_cnt) {
+          break;
+        } else {
+          continue;
+        }
+      }
+      batch_cnt += 1;
+      int taskid = batch_cnt * worker_num_ + thread_id_;
+      task = object_pool_.Get();
+      task->Reset();
+      task->PackTask(thread_scope_, taskid, device_reader_, cur_batch,
+                     program_);
+    }
+    for (;;) {
+      // pull sparse here
+      if (task->state_ == PULL_SPARSE) {
+        VLOG(3) << "pull sparse taskid = " << task->taskid_;
+        for (int i = 0;
+             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
+          uint64_t tid = static_cast<uint64_t>(
+              param_.program_config(0).pull_sparse_table_id(i));
+          TableParameter table;
+          for (auto j : param_.sparse_table()) {
+            if (j.table_id() == tid) {
+              table = j;
+              break;
+            }
+          }
+          fleet_ptr_->HeterPullSparseVars(
+              thread_id_, task, tid, sparse_key_names_[tid], table.fea_dim(),
+              sparse_value_names_[tid]);
+        }
+        task->Update();
+        // JumpContext(task);
+        // break;
+      } else if (task->state_ == OP_RUN) {
+        VLOG(3) << "oprun taskid = " << task->taskid_;
+        for (int i = 0;
+             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
+          uint64_t tid = static_cast<uint64_t>(
+              param_.program_config(0).pull_sparse_table_id(i));
+          CollectLabelInfo(task, i);
+          FillSparseValue(task, i);
+          auto nid_iter = std::find(sparse_value_names_[tid].begin(),
+                                    sparse_value_names_[tid].end(),
+                                    adjust_ins_weight_config_.nid_slot());
+          if (nid_iter != sparse_value_names_[tid].end()) {
+            AdjustInsWeight(task);
+          }
+        }
+        VLOG(3) << "fill sparse value for all sparse table done.";
+        // do computation here
+        for (int i = 0; i < xpu_begin_op_index_; ++i) {
+          auto& op = ops_[i];
+          bool need_skip = false;
+          for (auto t = 0u; t < skip_ops_.size(); ++t) {
+            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+              need_skip = true;
+              break;
+            }
+          }
+          if (!need_skip) {
+            VLOG(3) << "run op: " << op->Type();
+            op->Run(*(task->scope_), place_);
+          }
+        }
+        task->Update();
+      } else if (task->state_ == XPU) {
+        VLOG(3) << "call remote xpu taskid = " << task->taskid_;
+        std::vector<std::string> send_var_list;
+        for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
+          send_var_list.push_back(trainer_desc_.xpu_recv_list(i));
+        }
+        heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list);
+        task->Update();
+        JumpContext(task);
+        break;
+      } else if (task->state_ == OP_RUN_END) {
+        for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) {
+          auto& op = ops_[i];
+          bool need_skip = false;
+          for (auto t = 0u; t < skip_ops_.size(); ++t) {
+            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+              need_skip = true;
+              break;
+            }
+          }
+          if (!need_skip) {
+            op->Run(*(task->scope_), place_);
+          }
+        }
+        // check inf and nan
+        for (std::string& var_name : check_nan_var_names_) {
+          Variable* var = (task->scope_)->FindVar(var_name);
+          if (var == nullptr) {
+            continue;
+          }
+          LoDTensor* tensor = var->GetMutable<LoDTensor>();
+          if (tensor == nullptr) {
+            continue;
+          }
+        }
+        task->Update();
+      } else if (task->state_ == PUSH_GRAD) {
+        VLOG(3) << "push grad taskid = " << task->taskid_;
+        if (need_to_push_sparse_) {
+          // push gradients here
+          for (int i = 0;
+               i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+            uint64_t tid = static_cast<uint64_t>(
+                param_.program_config(0).push_sparse_table_id(i));
+            TableParameter table;
+            for (auto i : param_.sparse_table()) {
+              if (i.table_id() == tid) {
+                table = i;
+                break;
+              }
+            }
+            fleet_ptr_->HeterPushSparseVars(
+                task, tid, sparse_key_names_[tid], sparse_grad_names_[tid],
+                table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_,
+                no_cvm_);
+          }
+        }
+
+        if (need_to_push_sparse_) {
+          VLOG(3) << "push sparse gradient done.";
+          int32_t tmp_push_sparse_wait_times = -1;
+          static uint32_t push_sparse_wait_times =
+              static_cast<uint32_t>(tmp_push_sparse_wait_times);
+          if (push_sparse_status_.size() >= push_sparse_wait_times) {
+            for (auto& t : push_sparse_status_) {
+              t.wait();
+            }
+            push_sparse_status_.resize(0);
+          }
+
+          if (tmp_push_sparse_wait_times == -1) {
+            push_sparse_status_.resize(0);
+          }
+        }
+
+        // if (need_dump_field_) {
+        //   size_t batch_size = device_reader_->GetCurBatchSize();
+        //   std::vector<std::string> ars(batch_size);
+        //   for (auto& ar : ars) {
+        //     ar.clear();
+        //   }
+        //   auto& ins_id_vec = device_reader_->GetInsIdVec();
+        //   auto& ins_content_vec = device_reader_->GetInsContentVec();
+        //   for (size_t i = 0; i < ins_id_vec.size(); i++) {
+        //     ars[i] += ins_id_vec[i];
+        //     ars[i] = ars[i] + "\t" + ins_content_vec[i];
+        //   }
+        //   for (auto& field : dump_fields_) {
+        //     Variable* var = thread_scope_->FindVar(field);
+        //     if (var == nullptr) {
+        //       continue;
+        //     }
+        //     LoDTensor* tensor = var->GetMutable<LoDTensor>();
+        //     if (!CheckValidOutput(tensor, batch_size)) {
+        //       continue;
+        //     }
+        //     for (size_t i = 0; i < batch_size; ++i) {
+        //       auto output_dim = tensor->dims()[1];
+        //       std::string output_dimstr =
+        //           boost::lexical_cast<std::string>(output_dim);
+        //       ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
+        //       auto bound = GetTensorBound(tensor, i);
+        //       ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
+        //     }
+        //   }
+        //   // #pragma omp parallel for
+        //   for (size_t i = 0; i < ars.size(); i++) {
+        //     if (ars[i].length() == 0) {
+        //       continue;
+        //     }
+        //     writer_ << ars[i];
+        //   }
+        //   if (need_dump_param_ && thread_id_ == 0) {
+        //     DumpParam();
+        //   }
+        // }
+
+        // thread_scope_->DropKids();
+        task->Update();
+      } else if (task->state_ == DONE) {
+        VLOG(3) << "done taskid = " << task->taskid_;
+        object_pool_.Push(task);
+        PrintFetchVars();
+        ++done_cnt;
+        // ++batch_cnt;
+        break;
+      }
+    }
+  }
+  if (need_dump_field_) {
+    // writer_.Flush();
+  }
+  if (copy_table_config_.need_copy()) {
+    CopySparseTable();
+    CopyDenseTable();
+    CopyDenseVars();
+  }
+}
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ca1aa66319228fc95f63294b15c981dd3c8ba30
--- /dev/null
+++ b/paddle/fluid/framework/heterxpu_trainer.cc
@@ -0,0 +1,469 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstdlib>
+#include <ctime>
+#include <string>
+#include <vector>
+#include "io/fs.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/data_set.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include "paddle/fluid/framework/trainer.h"
+#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+namespace paddle {
+namespace framework {
+
+void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
+                                 Dataset* dataset) {
+  srand((unsigned)time(NULL));
+  param_ = trainer_desc.downpour_param();
+  for (int i = 0; i < param_.dense_table_size(); ++i) {
+    uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
+    auto table = param_.dense_table(i);
+    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
+    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
+      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
+    }
+  }
+  scale_datanorm_ = trainer_desc.scale_datanorm();
+  int place_num = trainer_desc.worker_places_size();
+  for (int i = 0; i < place_num; ++i) {
+    int num = trainer_desc.worker_places(i);
+    platform::CUDAPlace place = platform::CUDAPlace(num);
+    platform::CUDADeviceGuard guard(place.device);
+    cudaStream_t stream;
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
+    copy_streams_.push_back(stream);
+    places_.push_back(place);
+    cudaEvent_t event;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    events_.push_back(event);
+  }
+  // thread_num_ = trainer_desc.thread_num();
+  // SetDataset(dataset);
+
+  // dump_fields_path_ = trainer_desc.dump_fields_path();
+  // dump_converter_ = trainer_desc.dump_converter();
+  // need_dump_field_ = false;
+  // if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
+  //   need_dump_field_ = true;
+  // }
+  // if (need_dump_field_) {
+  //   auto &file_list = dataset->GetFileList();
+  //   if (file_list.size() == 0) {
+  //     need_dump_field_ = false;
+  //   }
+  // }
+  // mpi_rank_ = trainer_desc.mpi_rank();
+  // mpi_size_ = trainer_desc.mpi_size();
+  // dump_file_num_ = trainer_desc.dump_file_num();
+  // const std::vector<paddle::framework::DataFeed *> readers =
+  //     dataset->GetReaders();
+  // thread_num_ = readers.size();
+  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
+       i++) {
+    need_merge_var_names_.push_back(
+        trainer_desc.downpour_param().stat_var_names(i));
+  }
+  running_ = true;
+  VLOG(3) << "going to initialize pull dense worker";
+  pull_dense_worker_ = PullDenseWorker::GetInstance();
+  pull_dense_worker_->Initialize(trainer_desc);
+  VLOG(3) << "initialize pull dense worker";
+  SetDebug(trainer_desc.debug());
+  fleet_ptr_ = FleetWrapper::GetInstance();
+  heter_ptr_ = HeterWrapper::GetInstance();
+  RegisterServiceHandler();
+  // for (int i = 0; i < trainer_desc.worker_places_size(); ++i) {
+  //   int num = trainer_desc.worker_places(i);
+  //   platform::CUDAPlace place = platform::CUDAPlace(num);
+  //   platform::CUDADeviceGuard guard(place.device);
+  //   cudaStream_t stream;
+  //   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
+  //   copy_streams_.push_back(stream);
+  //   places_.push_back(place);
+  // }
+  trainer_desc_ = trainer_desc;
+}
+
+void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
+  auto place = places_[num];
+  Scope* scope = place_scopes_[num];
+  auto stream = copy_streams_[num];
+  auto event = events_[num];
+
+  auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+  platform::CUDADeviceGuard guard(dev_id);
+  auto& block = program.Block(0);
+  for (auto& var : block.AllVars()) {
+    if (var->Persistable()) {
+      auto name = var->Name();
+      Variable* root_var = root_scope_->FindVar(name);
+      LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
+      auto* ptr = scope->Var(name);
+      InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
+      LoDTensor* thread_tensor = ptr->GetMutable<LoDTensor>();
+
+#define HeterMemcpyFunc(cpp_type, proto_type)                           \
+  do {                                                                  \
+    if (root_tensor->type() == proto_type) {                            \
+      HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
+    }                                                                   \
+  } while (0)
+      _ForEachDataType_(HeterMemcpyFunc);
+    }
+  }
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
+  cudaEventSynchronize(event);
+}
+
+template <typename T>
+void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
+                                  LoDTensor* root_tensor,
+                                  const paddle::platform::Place& thread_place,
+                                  cudaStream_t stream) {
+  T* thread_ptr =
+      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
+  T* root_ptr = root_tensor->data<T>();
+  if (platform::is_cpu_place(root_tensor->place())) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
+                 platform::CPUPlace(), root_ptr,
+                 sizeof(T) * root_tensor->numel(), stream);
+  } else {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
+                 BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()),
+                 root_ptr, sizeof(T) * root_tensor->numel(), stream);
+  }
+}
+
+void HeterXpuTrainer::DumpWork(int tid) {}
+
+void HeterXpuTrainer::InitTrainerEnv(const ProgramDesc& main_program,
+                                     const platform::Place& place) {
+  CacheProgram(main_program);
+  place_ = place;
+  auto& profiler = paddle::ps::CostProfiler::instance();
+  profiler.register_profiler("xpu_service_run_task");
+  profiler.register_profiler("xpu_service_deserial");
+  profiler.register_profiler("xpu_service_launch_kernel");
+  profiler.register_profiler("xpu_service_wait");
+}
+
+void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
+  auto& block = main_program.Block(0);
+  pull_dense_worker_->SetRootScope(root_scope_);
+  pull_dense_worker_->CreatePinVar();
+  for (size_t i = 0; i < places_.size(); ++i) {
+    Scope* scope = &(root_scope_->NewScope());
+    // for (auto &var : block.AllVars()) {
+    //   if (var->Persistable()) {
+    //     auto *ptr = scope->Var(var->Name());
+    //     InitializeVariable(ptr, var->GetType());
+    //   }
+    // }
+    place_scopes_.push_back(scope);
+    CreateThreadParam(main_program, i);
+    pull_dense_worker_->AddThreadScope(scope);
+    pull_dense_worker_->AddPlace(places_[i]);
+    pull_dense_worker_->AddStream(copy_streams_[i]);
+  }
+
+  pull_dense_worker_->Start();
+  for (auto& stream : copy_streams_) {
+    cudaStreamSynchronize(stream);
+  }
+  op_names_.clear();
+  for (auto& op_desc : block.AllOps()) {
+    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
+    op_names_.push_back(op_desc->Type());
+    OperatorBase* local_op_ptr = local_op.release();
+    ops_.push_back(local_op_ptr);
+    continue;
+  }
+  xpu_begin_op_index_ = xpu_end_op_index_ = -1;
+  xpu_begin_op_index_ = trainer_desc_.xpu_start_idx();
+  xpu_end_op_index_ = trainer_desc_.xpu_end_idx();
+  VLOG(0) << "xpu begin: " << xpu_begin_op_index_
+          << " xpu end: " << xpu_end_op_index_;
+  // CHECK(xpu_begin_op_index_ == 0);
+  // CHECK(xpu_end_op_index_ = ops_.size() - 1);
+  //// init pool
+  for (size_t i = 0; i < 6; ++i) {
+    for (size_t j = 0; j < places_.size(); ++j) {
+      int num = j;
+      std::shared_ptr<HeterServiceContext> context =
+          std::make_shared<HeterServiceContext>();
+      context->place_num_ = num;
+      auto place = places_[num];
+      context->scope_ = &(place_scopes_[num]->NewScope());
+      auto& block = program_.Block(0);
+      for (auto& var : block.AllVars()) {
+        if (!var->Persistable()) {
+          auto* ptr = context->scope_->Var(var->Name());
+          InitializeVariable(ptr, var->GetType());
+        }
+      }
+      for (auto& v : dense_grad_names_) {
+        for (auto& name : v.second) {
+          auto* ptr = context->scope_->Var(name + "pin");
+          InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
+        }
+      }
+      for (auto& op_desc : block.AllOps()) {
+        std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
+        OperatorBase* local_op_ptr = local_op.release();
+        (context->ops_).push_back(local_op_ptr);
+      }
+      auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+      platform::CUDADeviceGuard guard(dev_id);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+      object_pool_.Push(context);
+    }
+  }
+  VLOG(3) << "init other env done.";
+}
+
+void HeterXpuTrainer::Run() {}
+
+int HeterXpuTrainer::EndPass(const HeterRequest* request,
+                             HeterResponse* response) {
+  // int scope_num = object_pool_.Size();
+  for (size_t i = 0; i < need_merge_var_names_.size(); i++) {
+    Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]);
+    if (root_var == nullptr) {
+      continue;
+    }
+    LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
+
+    for (size_t j = 0; j < place_scopes_.size(); j++) {
+      Scope* cur_thread_scope = place_scopes_[j];
+      Variable* thread_var =
+          cur_thread_scope->FindVar(need_merge_var_names_[i]);
+      if (thread_var == nullptr) {
+        continue;
+      }
+      LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
+//      if (root_tensor->numel() != thread_tensor->numel()) {
+//        continue;
+//      }
+#define MergeCallback(cpp_type, proto_type)                                    \
+  do {                                                                         \
+    if (root_tensor->type() == proto_type) {                                   \
+      if (thread_tensor->type() != proto_type) {                               \
+        VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \
+                << "] " << need_merge_var_names_[i]                            \
+                << ", root tensor type=" << root_tensor->type()                \
+                << ", thread tensor type=" << thread_tensor->type();           \
+        exit(-1);                                                              \
+      }                                                                        \
+      MergeToRootScope<cpp_type>(root_tensor, thread_tensor);                  \
+    }                                                                          \
+  } while (0)
+      _ForEachDataType_(MergeCallback);
+      if (platform::is_gpu_place(thread_tensor->place())) {
+        auto dev_id =
+            BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
+        platform::CUDADeviceGuard guard(dev_id);
+        cudaMemset(thread_tensor->data<void>(), 0,
+                   thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+      } else {
+        memset(thread_tensor->data<void>(), 0,
+               thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+      }
+    }
+    auto* merge_var = response->add_vars();
+    heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
+                               merge_var);
+    if (platform::is_gpu_place(root_tensor->place())) {
+      auto dev_id =
+          BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
+      platform::CUDADeviceGuard guard(dev_id);
+      cudaMemset(root_tensor->data<void>(), 0,
+                 root_tensor->numel() * SizeOfType(root_tensor->type()));
+    } else {
+      memset(root_tensor->data<void>(), 0,
+             root_tensor->numel() * SizeOfType(root_tensor->type()));
+    }
+  }
+  return 0;
+}
+
+template <typename T>
+void HeterXpuTrainer::MergeToRootScope(LoDTensor* root_tensor,
+                                       LoDTensor* tensor) {
+  LoDTensor tmp_root;
+  TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
+  T* tmp_root_data = tmp_root.data<T>();
+  LoDTensor tmp_tensor;
+  TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
+  T* data = tmp_tensor.data<T>();
+  for (int i = 0; i < tmp_tensor.numel(); i++) {
+    tmp_root_data[i] += data[i];
+  }
+  TensorCopy(tmp_root, root_tensor->place(), root_tensor);
+}
+
+int HeterXpuTrainer::StopService(const HeterRequest* request,
+                                 HeterResponse* response) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  running_ = false;
+  cond_.notify_one();
+  return 0;
+}
+
+int HeterXpuTrainer::RunTask(const HeterRequest* request,
+                             HeterResponse* response) {
+  auto timer = std::make_shared<paddle::ps::CostTimer>("xpu_service_run_task");
+  std::shared_ptr<HeterServiceContext> context = object_pool_.Get();
+
+  if (!context->scope_) {
+    int num = rand() % places_.size();
+    context->place_num_ = num;
+    auto place = places_[num];
+    context->scope_ = &(place_scopes_[num]->NewScope());
+    auto& block = program_.Block(0);
+    for (auto& var : block.AllVars()) {
+      if (!var->Persistable()) {
+        auto* ptr = context->scope_->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+      }
+    }
+    for (auto& v : dense_grad_names_) {
+      for (auto& name : v.second) {
+        auto* ptr = context->scope_->Var(name + "pin");
+        InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
+      }
+    }
+    for (auto& op_desc : block.AllOps()) {
+      std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
+      OperatorBase* local_op_ptr = local_op.release();
+      (context->ops_).push_back(local_op_ptr);
+    }
+
+    auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    platform::CUDADeviceGuard guard(dev_id);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+  }
+
+  context->Reset();
+  auto place = places_[context->place_num_];
+  {
+    auto deserial_timer =
+        std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
+    for (int i = 0; i < request->vars_size(); ++i) {
+      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
+                                      copy_streams_[context->place_num_]);
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
+    while (cudaEventQuery(context->event_) != cudaSuccess) {
+      VLOG(3) << "wait for kernel";
+      bthread_yield();
+    }
+  }
+
+  {
+    auto launch_timer =
+        std::make_shared<paddle::ps::CostTimer>("xpu_service_launch_kernel");
+    for (int i = xpu_begin_op_index_; i <= xpu_end_op_index_; ++i) {
+      auto& op = (context->ops_)[i];
+      op->Run(*(context->scope_), place);
+    }
+  }
+  auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(place));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaEventRecord(context->event_, dev_ctx->stream()));
+  // cudaEventSynchronize(context->event_);
+  {
+    auto wait_timer =
+        std::make_shared<paddle::ps::CostTimer>("xpu_service_wait");
+    while (cudaEventQuery(context->event_) != cudaSuccess) {
+      VLOG(3) << "wait for kernel";
+      bthread_yield();
+    }
+  }
+
+  for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
+    const std::string& varname = trainer_desc_.xpu_send_list(i);
+    // CHECK(varname == "concat_1.tmp_0@GRAD");
+    auto* res_var = response->add_vars();
+    heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
+  }
+
+  // std::string varname = "concat_1.tmp_0@GRAD";
+  //
+  // auto* res_var = response->add_vars();
+  // heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
+  for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+       ++i) {
+    uint64_t tid =
+        static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
+    fleet_ptr_->PushDenseVarsAsync(
+        *(context->scope_), tid, dense_grad_names_[tid],
+        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
+        places_[context->place_num_], copy_streams_[context->place_num_],
+        context->event_);
+  }
+  for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+       ++i) {
+    uint64_t tid =
+        static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
+    pull_dense_worker_->IncreaseThreadVersion(0, tid);
+  }
+  VLOG(3) << "push dense gradient done.";
+  context->scope_->DropKids();
+  object_pool_.Push(context);
+  VLOG(0) << "pool size " << object_pool_.Size();
+  return 0;
+}
+
+void HeterXpuTrainer::RegisterServiceHandler() {
+  heter_ptr_->RegisterServiceHandler(
+      0, [this](const HeterRequest* request, HeterResponse* response) -> int {
+        return this->RunTask(request, response);
+      });
+  heter_ptr_->RegisterServiceHandler(
+      1, [this](const HeterRequest* request, HeterResponse* response) -> int {
+        return this->EndPass(request, response);
+      });
+  heter_ptr_->RegisterServiceHandler(
+      2, [this](const HeterRequest* request, HeterResponse* response) -> int {
+        return this->StopService(request, response);
+      });
+}
+
+Scope* HeterXpuTrainer::GetWorkerScope(int thread_id) { return nullptr; }
+
+void HeterXpuTrainer::Finalize() {
+  // for (auto &th : threads_) {
+  //   th.join();
+  // }
+  std::unique_lock<std::mutex> lock(mutex_);
+  cond_.wait(lock, [this] { return !running_; });
+  sleep(3);
+  pull_dense_worker_->Stop();
+  root_scope_->DropKids();
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc
index c258028e25066d14820017edaaa103b39c57158d..316f8f9c7515ee0255a261645caa38a9807c3fc3 100644
--- a/paddle/fluid/framework/io/crypto/cipher.cc
+++ b/paddle/fluid/framework/io/crypto/cipher.cc
@@ -16,9 +16,6 @@
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef ON_INFER
-#include "paddle/fluid/inference/api/paddle_api.h"
-#endif
 namespace paddle {
 namespace framework {
 
@@ -59,7 +56,7 @@ std::shared_ptr<Cipher> CipherFactory::CreateCipher(
 }
 
 }  // namespace framework
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
 std::shared_ptr<framework::Cipher> MakeCipher(const std::string& config_file) {
   return framework::CipherFactory::CreateCipher(config_file);
 }
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 60e4ac8cbcfd8cc8f1d14363538fe1e118b953cd..9d3e0806ac79d838765ca5a4bbf61d0f67ab6ed5 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeBNFusePass);
 REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
+REGISTER_PASS(depthwise_conv_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvBNFusePass);
+REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index fcdbcf299c504c00b3027207bc2f4ac019d48ffc..57a9f69ca15af2759874a1e2a0b58399de652693 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
   std::string conv_type() const { return "conv2d_transpose"; }
 };
 
+class DepthwiseConvBNFusePass : public ConvBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
+class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index 431d3c05f6dd4b44074729716555744773f950e7..55449856d189065388facf3e3ce736f505e976fb 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -68,11 +68,35 @@ static bool HasInput(Node* n, std::string name) {
   return input_names_set.find(name) != input_names_set.end();
 }
 
+static Node* GetInputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* in : n->inputs) {
+    if (in->Name() == name) {
+      return in;
+    }
+  }
+  return nullptr;
+}
+
+static Node* GetOutputVar(Node* n, const std::string& name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  for (auto* out : n->outputs) {
+    if (out->Name() == name) {
+      return out;
+    }
+  }
+  return nullptr;
+}
+
 std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
     SubGraph* subgraph) {
-  std::unordered_map<std::string, int> var_ids = EncodeVarNodes(subgraph);
-  std::vector<Node*> intermediate_out_nodes =
-      subgraph->GetIntermediateOutVarNodes();
+  std::unordered_map<Node*, int> var_ids = EncodeVarNodes(subgraph);
+  std::unordered_set<Node*> intermediate_out_vars_set =
+      subgraph->GetIntermediateOutVarNodesSet();
   std::vector<OperationExpression> expressions;
   for (auto* node : subgraph->SortedNodes()) {
     if (node && node->IsOp() && node->Op()) {
@@ -92,11 +116,12 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
         // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
         if ((HasInput(node, name) && op->Input(name).size() >= 1U)) {
           for (size_t i = 0; i < op->Input(name).size(); i++) {
+            Node* input_var = GetInputVar(node, op->Input(name)[i]);
             PADDLE_ENFORCE_NE(
-                var_ids.find(op->Input(name)[i]), var_ids.end(),
+                var_ids.find(input_var), var_ids.end(),
                 platform::errors::InvalidArgument(
                     "Input(%s) of operation %s is not set.", name, op->Type()));
-            input_ids.push_back(var_ids[op->Input(name)[i]]);
+            input_ids.push_back(var_ids[input_var]);
           }
         } else {
           input_ids.push_back(-1);
@@ -106,31 +131,29 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       // Output ids should be set in fixed order, like:
       //  - dx, dy in backward operations
       std::vector<int> output_ids;
+      std::vector<int> intermediate_output_ids;
       std::vector<std::string> output_names =
           OperationMap::Instance().Get(op->Type()).output_names;
-      std::unordered_map<int, bool> intermediate_state;
 
       for (auto& name : output_names) {
+        Node* output_var = GetOutputVar(node, op->Output(name)[0]);
         PADDLE_ENFORCE_NE(
-            var_ids.find(op->Output(name)[0]), var_ids.end(),
+            var_ids.find(output_var), var_ids.end(),
             platform::errors::InvalidArgument(
                 "Output(%s) of operation %s is not set.", name, op->Type()));
-        output_ids.push_back(var_ids[op->Output(name)[0]]);
-        bool enable_intermediate = false;
-        for (auto* n : intermediate_out_nodes) {
-          if (n->Name() == op->Output(name)[0]) {
-            enable_intermediate = true;
-            break;
-          }
+        output_ids.push_back(var_ids[output_var]);
+        if (!subgraph->SaveIntermediateOut() &&
+            intermediate_out_vars_set.find(output_var) !=
+                intermediate_out_vars_set.end()) {
+          intermediate_output_ids.push_back(var_ids[output_var]);
         }
-        intermediate_state[var_ids[op->Output(name)[0]]] = enable_intermediate;
       }
 
       std::string lhs_type = ExtractDataType(node->outputs);
       std::string rhs_type = ExtractDataType(node->inputs);
       auto expression =
           OperationExpression(node->Name(), input_ids, output_ids, rhs_type,
-                              lhs_type, intermediate_state);
+                              lhs_type, intermediate_output_ids);
       expression.SetAttr(attr);
       expressions.push_back(expression);
     }
@@ -146,17 +169,18 @@ std::string CodeGenerator::Generate(
   // TODO(liuyiqun): Check whether all expressions are elementwise operations.
   std::set<int> input_ids = std::move(DistilInputIds(expressions));
   std::set<int> output_ids = std::move(DistilOutputIds(expressions));
-  std::set<int> intermediate_ids =
+  std::set<int> intermediate_output_ids =
       std::move(DistilIntermediateIds(expressions));
   std::unordered_map<int, std::string> dtypes =
       std::move(DistilDtypes(expressions));
   TemplateVariable template_var;
   template_var.Add("func_name", func_name);
-  template_var.Add("parameters", EmitParameters(input_ids, output_ids,
-                                                intermediate_ids, dtypes));
+  template_var.Add(
+      "parameters",
+      EmitParameters(input_ids, output_ids, intermediate_output_ids, dtypes));
   template_var.Add("compute_body",
                    EmitComputeBody(expressions, input_ids, output_ids,
-                                   intermediate_ids, dtypes));
+                                   intermediate_output_ids, dtypes));
 
   std::set<std::string> all_dtype;
   for (const auto& type : dtypes) {
@@ -204,18 +228,14 @@ std::set<int> CodeGenerator::DistilOutputIds(
 
 std::set<int> CodeGenerator::DistilIntermediateIds(
     const std::vector<OperationExpression>& expressions) {
-  std::set<int> intermediate_ids;
+  std::set<int> intermediate_output_ids;
   // Use std::set to remove the reptead id and get a ordered list.
   for (size_t i = 0; i < expressions.size(); i++) {
-    for (auto id : expressions[i].GetOutputIds()) {
-      auto intermediate_state = expressions[i].GetIntermediateState();
-      if (intermediate_state.find(id) != intermediate_state.end() &&
-          intermediate_state[id]) {
-        intermediate_ids.insert(id);
-      }
+    for (auto id : expressions[i].GetIntermediateOutputIds()) {
+      intermediate_output_ids.insert(id);
     }
   }
-  return intermediate_ids;
+  return intermediate_output_ids;
 }
 
 std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(
@@ -316,26 +336,29 @@ std::string CodeGenerator::EmitComputeBody(
   return load.str() + compute.str() + store.str();
 }
 
-std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
+std::unordered_map<Node*, int> CodeGenerator::EncodeVarNodes(
     SubGraph* subgraph) {
   const auto& input_var_nodes = subgraph->GetInputVarNodes();
-  const auto& output_var_nodes = subgraph->GetOutputVarNodes();
+  // Encode all var nodes, including intermediate output var nodes.
+  const auto& output_var_nodes = subgraph->GetOutputVarNodes(true);
 
   int id = 0;
-  std::unordered_map<std::string, int> var_ids;
+  std::unordered_map<Node*, int> var_ids;
   // Numbering input vars.
   for (auto* in : input_var_nodes) {
-    VLOG(3) << "Encoding input names:" << in->Name() << ", id:" << id;
-    if (var_ids.find(in->Name()) == var_ids.end()) {
-      var_ids[in->Name()] = id++;
+    VLOG(3) << "Encoding input names:" << in->Name() << "(" << in
+            << "), id:" << id;
+    if (var_ids.find(in) == var_ids.end()) {
+      var_ids[in] = id++;
     }
   }
 
   // Encoding output vars.
   for (auto* out : output_var_nodes) {
-    VLOG(3) << "Ecoding output names:" << out->Name() << ", id:" << id;
-    if (var_ids.find(out->Name()) == var_ids.end()) {
-      var_ids[out->Name()] = id++;
+    VLOG(3) << "Ecoding output names:" << out->Name() << "(" << out
+            << "), id:" << id;
+    if (var_ids.find(out) == var_ids.end()) {
+      var_ids[out] = id++;
     }
   }
   return var_ids;
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.h b/paddle/fluid/framework/ir/fusion_group/code_generator.h
index 2b18657bbcfbe81d4504306a3753d5c0b82092fd..21773f239b9f6e5208aea45f481bf6f92745033f 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.h
@@ -61,7 +61,7 @@ class CodeGenerator {
       const std::unordered_map<int, std::string>& dtypes) const;
 
   // Encode all var nodes in the subgraph with an unique number.
-  std::unordered_map<std::string, int> EncodeVarNodes(SubGraph* subgraph);
+  std::unordered_map<Node*, int> EncodeVarNodes(SubGraph* subgraph);
 
  private:
   std::vector<CodeTemplate> code_templates_;
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
index 03d28277afbbb7467f638d521414d702bc8e8179..910f71e65bed10a515f9401a5b09a27ba0929fcf 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
@@ -48,20 +48,20 @@ class OperationExpression {
       std::string op_type, const std::vector<int>& input_ids,
       const std::vector<int>& output_ids, std::string rhs_type,
       std::string lhs_type,
-      const std::unordered_map<int, bool>& intermediate_state = {})
+      const std::vector<int>& intermediate_output_ids = {})
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
         rhs_type_(rhs_type),
         lhs_type_(lhs_type),
-        intermediate_state_(intermediate_state) {}
+        intermediate_output_ids_(intermediate_output_ids) {}
 
   std::string GetOpType() const { return op_type_; }
-  std::unordered_map<int, bool> GetIntermediateState() const {
-    return intermediate_state_;
-  }
   std::vector<int> GetInputIds() const { return input_ids_; }
   std::vector<int> GetOutputIds() const { return output_ids_; }
+  std::vector<int> GetIntermediateOutputIds() const {
+    return intermediate_output_ids_;
+  }
   std::string GetRHSType() const { return rhs_type_; }
   std::string GetLHSType() const { return lhs_type_; }
   void SetAttr(AttributeMap attr) { attr_ = attr; }
@@ -84,7 +84,7 @@ class OperationExpression {
   AttributeMap attr_;
   std::string rhs_type_;
   std::string lhs_type_;
-  std::unordered_map<int, bool> intermediate_state_;
+  std::vector<int> intermediate_output_ids_;
 };
 
 class TemplateVariable {
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 89b05fc577bb46606ff5c43d0dd697bd7b8aed38..ebc89b14c265d3491f0f9bc64a36f52c6c9f2a18 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -144,7 +144,6 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
       LOG(INFO) << "Precision check failed from i = " << id
                 << ", expect: " << expect << ", actual: " << actual;
       EXPECT_LT(fabs(actual - expect), eps);
-      break;
     }
   }
 }
@@ -465,7 +464,7 @@ TEST(code_generator, subgraph) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(false, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
                                     graph->Nodes());
 
     // Expressions generated by code_generator (they may be different):
@@ -484,7 +483,7 @@ TEST(code_generator, subgraph_grad) {
   for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(true, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
+    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
                                     DistilGradNodes(graph));
 
     // Expressions generated by code_generator (they may be different):
diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
index 5de253bb96743dc18b6394f04d7818a090a114c2..f6262762a2af6e1abec47fca2bce85a74116b5fd 100644
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
@@ -63,7 +63,7 @@ static bool IsEqualAndNotEmpty(const std::vector<int64_t>& l,
 bool GroupDetector::CheckPrecondition(const Node* n) {
   auto check_data_type = [&](const std::vector<Node*>& nodes) -> bool {
     bool is_first = true;
-    proto::VarType::Type data_type_0;
+    proto::VarType::Type data_type_0 = proto::VarType::BOOL;
     for (auto* n : nodes) {
       if (n && n->IsVar() && n->Var()) {
         if (n->Var()->GetType() != proto::VarType::LOD_TENSOR) {
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index 883347085926f08adb877d6a7fbe8e5c5e8e1c50..2cf71cdcefcd595c85da63ecb0782d16de5dddb8 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -63,11 +63,6 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
         std::unordered_set<Node*>(vec.begin(), vec.end()));
     VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";
 
-    // In elementwise fused kernel, memory is the bound of execution,
-    // here we remove the output id to use less memory and less time.
-    if (subgraph.RemoveIntermediateOut()) {
-      subgraph.DetectIntermediateOutWithGraph(graph);
-    }
     if (subgraph.IsValid(min_subgraph_size)) {
       subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
@@ -115,57 +110,52 @@ static int ExtractOpRole(fusion_group::SubGraph* subgraph) {
 
 void FusionGroupPass::InsertFusionGroupOp(
     Graph* graph, fusion_group::SubGraph* subgraph) const {
-  const std::vector<Node*>& input_vars_of_subgraph =
-      subgraph->GetInputVarNodes();
-  const std::vector<Node*>& output_vars_of_subgraph =
-      subgraph->GetOutputVarNodes();
-  const std::vector<Node*> intermediate_vars_of_subgraph =
-      subgraph->GetIntermediateOutVarNodes();
+  const std::vector<Node*>& input_vars = subgraph->GetInputVarNodes();
+  const std::vector<Node*>& output_vars =
+      subgraph->GetOutputVarNodes(subgraph->SaveIntermediateOut());
   std::unordered_set<Node*> external_nodes;
 
-  OpDesc op_desc;
-  op_desc.SetType("fusion_group");
-
+  // Prepare inputs.
   std::vector<std::string> input_names;
-  std::vector<std::string> inputs_data_types;
-  for (auto* n : input_vars_of_subgraph) {
-    input_names.push_back(n->Name());
-    inputs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-    external_nodes.insert(n);
+  std::vector<int> input_dtypes;
+  std::unordered_set<Node*> output_vars_set(output_vars.begin(),
+                                            output_vars.end());
+  for (auto* n : input_vars) {
+    // It is not an output var node.
+    if (output_vars_set.find(n) == output_vars_set.end()) {
+      input_names.push_back(n->Name());
+      input_dtypes.push_back(n->Var()->GetDataType());
+      external_nodes.insert(n);
+    }
   }
-  op_desc.SetInput("Inputs", input_names);
 
+  // Prepare outputs.
   std::vector<std::string> output_names;
-  std::vector<std::string> outs_data_types;
-  std::vector<Node*> output_var_without_intermediate;
-  for (auto* n : output_vars_of_subgraph) {
-    auto it_input =
-        find(input_vars_of_subgraph.begin(), input_vars_of_subgraph.end(), n);
-    auto it_intermediate = find(intermediate_vars_of_subgraph.begin(),
-                                intermediate_vars_of_subgraph.end(), n);
-    if (it_intermediate == intermediate_vars_of_subgraph.end() &&
-        it_input == input_vars_of_subgraph.end()) {
-      output_names.push_back(n->Name());
-      outs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
-      output_var_without_intermediate.push_back(n);
-    }
+  std::vector<int> output_dtypes;
+  for (auto* n : output_vars) {
+    output_names.push_back(n->Name());
+    output_dtypes.push_back(n->Var()->GetDataType());
     external_nodes.insert(n);
   }
 
+  OpDesc op_desc;
+  op_desc.SetType("fusion_group");
+  op_desc.SetInput("Inputs", input_names);
   op_desc.SetOutput("Outs", output_names);
-  op_desc.SetAttr("inputs_data_type", inputs_data_types);
-  op_desc.SetAttr("outs_data_type", outs_data_types);
+  op_desc.SetAttr("inputs_dtype", input_dtypes);
+  op_desc.SetAttr("outs_dtype", output_dtypes);
   op_desc.SetAttr("type", subgraph->GetType());
   op_desc.SetAttr("func_name", subgraph->GetFuncName());
   op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                   ExtractOpRole(subgraph));
 
   Node* fusion_group_node = graph->CreateOpNode(&op_desc);
-  for (auto* in : input_vars_of_subgraph) {
-    IR_NODE_LINK_TO(in, fusion_group_node);
+  for (auto* in : input_vars) {
+    if (output_vars_set.find(in) == output_vars_set.end()) {
+      IR_NODE_LINK_TO(in, fusion_group_node);
+    }
   }
-
-  for (auto* out : output_var_without_intermediate) {
+  for (auto* out : output_vars) {
     IR_NODE_LINK_TO(fusion_group_node, out);
   }
 
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index b127d132bafb00f32e7e6c5d2681d6f0e78b4c34..921cf0904f632936862b18b2f083f18a33c760be 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -105,12 +105,6 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   insert_handler("tanh", "%{2.0} / (%{1.0} + Exp(-%{2.0} * ${0})) - %{1.0}",
                  {"${2} * (%{1.0} - ${1} * ${1})"});
 
-  // cast:
-  // out = static_cast<T>(x)
-  // TODO(wangchaochaohu): This is not the compelete definition of
-  // cast Op, We need refine it later.
-  insert_handler("cast", "${0}", {});
-
   // sqrt:
   //  out = x^(1/2)
   //  dx = dout * 0.5 / out
@@ -121,11 +115,21 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   //  dx = dout * 2.0 * x
   insert_handler("square", "${0} * ${0}", {"${2} * %{2.0} * ${0}"});
 
+  // assign:
+  //  out = x
+  insert_handler("assign", "${0}", {});
+
+  // cast:
+  //  out = static_cast<T>(x)
+  // TODO(wangchaochaohu): This is not the compelete definition of
+  //  cast Op, We need refine it later.
+  insert_handler("cast", "${0}", {});
+
   // scale
-  // out = (bias_after_scale) ? scale * X +  bias : scale(X + bias)
-  // here we use '=' operator to seperate th default value
+  //  out = (bias_after_scale) ? scale * X +  bias : scale(X + bias)
+  //  here we use '=' operator to seperate th default value
   // TODO(wangchaochaohu): Later we need to support Tensor input for scale and
-  // bias.
+  //  bias.
   insert_handler(
       "scale",
       "${bias_after_scale=true} ? (${scale=%{1.0}} * ${0} + "
diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h
index 66b17e9f6fe95519b7687bc1c5725684c5c98610..5a29e875aea615c36711aa7dc044e4e1f563c297 100644
--- a/paddle/fluid/framework/ir/fusion_group/subgraph.h
+++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h
@@ -66,11 +66,12 @@ class SubGraph {
   }
 
   int GetType() const { return type_; }
-  bool RemoveIntermediateOut() { return !save_intermediate_out_; }
 
   void SetFuncName(std::string func_name) { func_name_ = func_name; }
   std::string GetFuncName() const { return func_name_; }
 
+  bool SaveIntermediateOut() const { return save_intermediate_out_; }
+
   const std::unordered_set<Node*>& Nodes() const { return nodes_set_; }
   const std::vector<Node*>& SortedNodes() {
     if (!is_sorted_) {
@@ -118,66 +119,88 @@ class SubGraph {
     return input_vars;
   }
 
-  std::vector<Node*> GetOutputVarNodes() {
+  std::vector<Node*> GetOutputVarNodes(bool with_intermediate_out) {
     // The order of output nodes should be consistant anywhere..
-    std::vector<Node*> output_vars_all;
+    std::vector<Node*> output_vars;
     for (auto* n : SortedNodes()) {
-      if (n && n->IsVar() && n->Var()) {
+      if (IsOutputOfInternalOp(n)) {
         // If the var_node is the output of some op_node in the subgraph, it
         // is considered the output var node of the subgraph.
-        bool is_found = false;
-        for (auto* in : n->inputs) {
-          if (Has(in)) {
-            is_found = true;
+        if (with_intermediate_out) {
+          output_vars.push_back(n);
+        } else {
+          if (n->outputs.empty() || IsInputOfExternalOp(n)) {
+            output_vars.push_back(n);
           }
         }
-        if (is_found) {
-          output_vars_all.push_back(n);
-        }
       }
     }
-    return output_vars_all;
+    return output_vars;
   }
 
   std::vector<Node*> GetIntermediateOutVarNodes() {
-    return intermediate_out_nodes_;
+    // Intermediate output var nodes: the output of some op_node in the
+    // subgraph, but not referenced outside the subgraph.
+    std::vector<Node*> intermediate_out_vars;
+    for (auto* n : SortedNodes()) {
+      if (IsOutputOfInternalOp(n) && IsInputOfInternalOp(n) &&
+          !IsInputOfExternalOp(n)) {
+        // When the outputs size is 0, it is also considered a intermidiate
+        // output. It maybe an unused output or the fetching vars, so that we
+        // cannot eleiminate it directly here.
+        intermediate_out_vars.push_back(n);
+      }
+    }
+    return intermediate_out_vars;
   }
 
-  void DetectIntermediateOutWithGraph(Graph* graph) {
-    auto graph_nodes = graph->Nodes();
-
-    for (auto* n : SortedNodes()) {
-      bool enable_remove = true;
+  std::unordered_set<Node*> GetIntermediateOutVarNodesSet() {
+    std::vector<Node*> intermediate_out_vars = GetIntermediateOutVarNodes();
+    return std::unordered_set<Node*>(intermediate_out_vars.begin(),
+                                     intermediate_out_vars.end());
+  }
 
-      if (n && n->IsVar() && n->Var()) {
-        bool leaf_graph = true;
-        for (auto* node : graph_nodes) {
-          if (node->IsOp()) {
-            auto inputs = node->inputs;
-            for (auto* in : inputs) {
-              if (in && in->Name() == n->Name()) {
-                if (!Has(node)) enable_remove = false;
-                leaf_graph = false;
-              }
-            }
-          }
-          if (!enable_remove) {
-            break;
-          }
+ private:
+  bool IsInputOfInternalOp(Node* n) {
+    bool is_input_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (Has(out)) {
+          is_input_of_internal_op = true;
+          break;
         }
-        if (leaf_graph) enable_remove = false;
+      }
+    }
+    return is_input_of_internal_op;
+  }
 
-      } else {
-        enable_remove = false;
+  bool IsInputOfExternalOp(Node* n) {
+    // If n is the input any one node outside the subgraph.
+    bool is_input_of_external_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* out : n->outputs) {
+        if (!Has(out)) {
+          is_input_of_external_op = true;
+          break;
+        }
       }
+    }
+    return is_input_of_external_op;
+  }
 
-      if (enable_remove) {
-        intermediate_out_nodes_.push_back(n);
+  bool IsOutputOfInternalOp(Node* n) {
+    bool is_output_of_internal_op = false;
+    if (Has(n) && n && n->IsVar() && n->Var()) {
+      for (auto* in : n->inputs) {
+        if (Has(in)) {
+          is_output_of_internal_op = true;
+          break;
+        }
       }
     }
+    return is_output_of_internal_op;
   }
 
- private:
   void TopologicalSort() {
     if (!is_sorted_) {
       std::unordered_map<Node*, std::vector<Node*>> inputs_map;
@@ -236,7 +259,6 @@ class SubGraph {
   bool save_intermediate_out_{true};
 
   std::unordered_set<Node*> nodes_set_;
-  std::vector<Node*> intermediate_out_nodes_{};
   bool is_sorted_{false};
   std::vector<Node*> sorted_nodes_;
 };
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 23419d5b9e0a20adcb6245a5a5aa4c5c4b5f3a34..aa0979b4be64ae3ccebbd7cc82abcf4a4712527a 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
@@ -54,7 +55,7 @@ void LogQuantizationDisabled(Node* op) {
   std::stringstream msg_ss;
   VLOG(4) << "Qantization skipped for operator " << op->Name()
           << " (type: " << op->Op()->Type() << ", id: " << op->id()
-          << "). Attribute use_quantizer = false.";
+          << "). Attribute mkldnn_data_type != \"int8\".";
 }
 
 }  // namespace
@@ -228,12 +229,12 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,
 
 bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
   return node->Op()->Type() == "dequantize" ||
-         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+         platform::HasOpINT8DataType(node->Op());
 }
 
 bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
   return node->Op()->Type() == "quantize" ||
-         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+         platform::HasOpINT8DataType(node->Op());
 }
 
 void CPUQuantizePass::QuantizeConv(Graph* graph,
@@ -248,10 +249,9 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
                      Graph* g) {
     VLOG(4) << "Quantize conv2d op";
     GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-    auto* conv_op_desc = conv_op->Op();
 
     // skip if should not be quantized
-    if (!conv_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(conv_op->Op())) {
       LogQuantizationDisabled(conv_op);
       return;
     }
@@ -353,14 +353,13 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize fc op";
     GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
-    auto* fc_op_desc = fc->Op();
 
     // skip if should not be quantized
-    if (!fc_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(fc->Op())) {
       LogQuantizationDisabled(fc);
       return;
     }
-    if (!fc_op_desc->GetAttrIfExists<bool>("use_mkldnn")) {
+    if (!fc->Op()->GetAttrIfExists<bool>("use_mkldnn")) {
       return;
     }
 
@@ -420,10 +419,9 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize pool2d op";
     GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
-    auto* pool_op_desc = pool_op->Op();
 
     // skip if should not be quantized
-    if (!pool_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(pool_op->Op())) {
       LogQuantizationDisabled(pool_op);
       return;
     }
@@ -465,10 +463,9 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize concat op";
     GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_pattern);
-    auto* concat_op_desc = concat_op->Op();
 
     // skip if should not be quantized
-    if (!concat_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(concat_op->Op())) {
       LogQuantizationDisabled(concat_op);
       return;
     }
@@ -511,10 +508,9 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize prior_box op";
     GET_IR_NODE_FROM_SUBGRAPH(prior_box_op, prior_box_op, prior_box_pattern);
-    auto* prior_box_op_desc = prior_box_op->Op();
 
     // skip if should not be quantized
-    if (!prior_box_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(prior_box_op->Op())) {
       LogQuantizationDisabled(prior_box_op);
       return;
     }
@@ -554,10 +550,9 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize transpose op";
     GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, transpose_pattern);
-    auto* transpose_op_desc = transpose_op->Op();
 
     // skip if should not be quantized
-    if (!transpose_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(transpose_op->Op())) {
       LogQuantizationDisabled(transpose_op);
       return;
     }
@@ -609,10 +604,9 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize reshape op";
     GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern);
-    auto* reshape_op_desc = reshape_op->Op();
 
     // skip if should not be quantized
-    if (!reshape_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(reshape_op->Op())) {
       LogQuantizationDisabled(reshape_op);
       return;
     }
@@ -662,10 +656,9 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
                      Graph* g) {
     VLOG(4) << "Quantize matmul op";
     GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
-    auto* matmul_op_desc = matmul_op->Op();
 
     // skip if should not be quantized
-    if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(matmul_op->Op())) {
       LogQuantizationDisabled(matmul_op);
       return;
     }
@@ -732,10 +725,9 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
     VLOG(4) << "Quantize elementwise_add op";
     GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
                               elementwise_add_pattern);
-    auto* elementwise_add_op_desc = elementwise_add_op->Op();
 
     // skip if should not be quantized
-    if (!elementwise_add_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) {
       LogQuantizationDisabled(elementwise_add_op);
       return;
     }
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 395b419cac13d6d76b6e30579e52a1957b548bab..a66e9f0e9389843267d56d09d3de23add3d34a7f 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -26,7 +26,7 @@ namespace ir {
 void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
            const std::vector<std::string>& inputs,
            const std::vector<std::string>& outputs, bool use_mkldnn,
-           bool use_quantizer = false) {
+           const std::string& mkldnn_data_type = "float32") {
   auto* op = prog->MutableBlock(0)->AppendOp();
   op->SetType(type);
   op->SetAttr("use_mkldnn", use_mkldnn);
@@ -47,14 +47,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
       op->SetAttr("fuse_residual_connection", false);
     }
     op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_in", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
     op->SetAttr("Scale_weights", std::vector<float>{1.0f});
   } else if (type == "pool2d" || type == "transpose2" || type == "reshape2") {
     op->SetInput("X", {inputs[0]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
   } else if (type == "dropout") {
     op->SetInput("X", {inputs[0]});
     op->SetOutput("Out", {outputs[0]});
@@ -63,14 +63,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
     if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_in", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
     op->SetAttr("Scale_weights", std::vector<float>{1.0f});
   } else if (type == "concat") {
     op->SetInput("X", inputs);
     op->SetOutput("Out", outputs);
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
   } else if (type == "dequantize") {
     op->SetInput("Input", {inputs[0]});
     op->SetOutput("Output", {outputs[0]});
@@ -79,7 +79,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
@@ -87,7 +87,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
     op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
@@ -142,7 +142,8 @@ static const std::initializer_list<std::string> variable_names{
 // d->Dropout1->g and (g, w5, b3)->Fc1->h and (h,w3,b1,i)->Conv3->j
 //
 // (d,w4, b2)->Conv4->i
-ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
+ProgramDesc BuildProgramDesc(bool use_mkldnn,
+                             const std::string& mkldnn_data_type) {
   ProgramDesc prog;
   for (auto& v : variable_names) {
     auto* var = prog.MutableBlock(0)->Var(v);
@@ -152,21 +153,21 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
   }
 
   SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
+        mkldnn_data_type);
+  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, mkldnn_data_type);
 
   SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
+        mkldnn_data_type);
+  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, mkldnn_data_type);
 
   SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
   SetOp(&prog, "fc", "Fc1", {"g", "w5", "b3"}, {"h"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
   SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
 
   SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
 
   return prog;
 }
@@ -215,7 +216,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
 
 TEST(CpuQuantizePass, quantize) {
   bool use_mkldnn = true;
-  bool use_quantizer = true;
+  std::string mkldnn_data_type = "int8";
   // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
   // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
   //
@@ -228,16 +229,16 @@ TEST(CpuQuantizePass, quantize) {
   // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
   // Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT
   int added_nodes = 8 + 8 + 7 + 7;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 8, 7, added_nodes,
-           2.0f * 127);
+  MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 8, 7,
+           added_nodes, 2.0f * 127);
 }
 
 TEST(CpuQuantizePass, do_not_quantize) {
   bool use_mkldnn = true;
-  bool use_quantizer = false;
+  std::string mkldnn_data_type = "float32";
   int added_nodes = 0;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
-           1.0f);
+  MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 0, 0,
+           added_nodes, 1.0f);
 }
 
 static const std::initializer_list<std::string> variable_names_concat = {
@@ -250,10 +251,10 @@ static const std::initializer_list<std::string> variable_names_concat = {
 ProgramDesc BuildProgramDescConcat() {
   ProgramDesc prog;
 
-  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, false);
-  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, false);
-  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, true);
-  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, "float32");
+  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, "float32");
+  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, "int8");
+  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, "float32");
 
   return prog;
 }
@@ -321,11 +322,11 @@ ProgramDesc BuildProgramDescTranspose() {
     }
   }
 
-  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, true);
-  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, true);
-  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, "int8");
+  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, "int8");
+  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
@@ -400,8 +401,8 @@ ProgramDesc BuildProgramDescReshape() {
     prog.MutableBlock(0)->Var(v);
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
-  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32");
 
   return prog;
 }
@@ -415,9 +416,9 @@ ProgramDesc BuildProgramDescReshapeBetweenNonQuantizedOp() {
     prog.MutableBlock(0)->Var(v);
   }
 
-  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, false);
-  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32");
+  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32");
 
   return prog;
 }
@@ -505,8 +506,8 @@ ProgramDesc BuildProgramDescMatmul() {
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
   SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
-  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
@@ -518,8 +519,8 @@ ProgramDesc BuildProgramDescMatmulNotQuantized() {
   }
   SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
   SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true);
-  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
@@ -590,8 +591,8 @@ ProgramDesc BuildProgramDescElementwiseAdd() {
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
   SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
   SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true,
-        true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+        "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index d570d885c6ebe7a6ff2c7a9047959a2e773541a5..0644cf9bb6575462d2d8362713a4720d2684bf8d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -32,11 +32,16 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
                     n->id()) != excluded_ids_list.end())
         continue;
       auto* op = n->Op();
-      if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) {
-        if (op_types_list.empty()) {
-          op->SetAttr("use_quantizer", true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             op->Type()) != op_types_list.end()) {
+      if (op->HasAttr("mkldnn_data_type") ||
+          op->HasProtoAttr("mkldnn_data_type")) {
+        // use_quantizer is no longer used
+        // assign value for compatibility
+        if (op->GetAttrIfExists<bool>("use_quantizer")) {
+          op->SetAttr("mkldnn_data_type", std::string("int8"));
+        }
+        if (std::find(op_types_list.begin(), op_types_list.end(), op->Type()) !=
+            op_types_list.end()) {
+          op->SetAttr("mkldnn_data_type", std::string("int8"));
           op->SetAttr("use_quantizer", true);
         }
       }
@@ -53,7 +58,10 @@ REGISTER_PASS(cpu_quantize_placement_pass,
     // a vector of operator type names to be quantized ("conv2d" etc.)
     // the second param is the default value for this vector
     .DefaultPassAttr("quantize_enabled_op_types",
-                     new std::unordered_set<std::string>())
+                     new std::unordered_set<std::string>(
+                         {"concat", "conv2d", "elementwise_add", "fc", "matmul",
+                          "pool2d", "prior_box", "relu", "reshape2",
+                          "transpose2"}))
     // a vector of operator ids that are to be excluded from quantization
     // the second param is the default value for this vector
     .DefaultPassAttr("quantize_excluded_op_ids", new std::unordered_set<int>());
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 027d3e2861b506003c4ce1cbe8bd7ee02e0c9c2d..6977a9495853f9aa9a0680cafc51a170b848bb37 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 
 #include <gtest/gtest.h>
-#include <boost/logic/tribool.hpp>
+#include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -24,13 +24,11 @@ namespace ir {
 void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
            const std::vector<std::string>& inputs,
            const std::vector<std::string>& outputs,
-           boost::tribool use_quantizer) {
+           const std::string& mkldnn_data_type = "float32") {
   auto* op = prog->MutableBlock(0)->AppendOp();
 
   op->SetType(type);
-
-  if (!boost::indeterminate(use_quantizer))
-    op->SetAttr("use_quantizer", use_quantizer);
+  op->SetAttr("mkldnn_data_type", mkldnn_data_type);
 
   if (type == "conv2d") {
     op->SetAttr("name", name);
@@ -50,7 +48,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
   op->SetOutput("Out", {outputs[0]});
 }
 
-// operator                      use_quantizer
+// operator                      mkldnn_data_type
 // ---------------------------------------
 // (a,b)->concat->c              none
 // (c,weights,bias)->conv->f     false
@@ -71,19 +69,19 @@ ProgramDesc BuildProgramDesc() {
     }
   }
 
-  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
-  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
-  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
-  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
-  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
-  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, "float32");
+  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, "float32");
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, "float32");
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, "float32");
+  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, "float32");
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, "float32");
 
   return prog;
 }
 
 void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
               std::initializer_list<int> quantize_excluded_op_ids,
-              unsigned expected_use_quantizer_true_count) {
+              unsigned expected_int8_data_type_count) {
   auto prog = BuildProgramDesc();
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
@@ -96,38 +94,34 @@ void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
 
   graph.reset(pass->Apply(graph.release()));
 
-  unsigned use_quantizer_true_count = 0;
+  unsigned int8_data_type_count = 0;
 
   for (auto* node : graph->Nodes()) {
     if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_quantizer") &&
-          BOOST_GET_CONST(bool, op->GetAttr("use_quantizer"))) {
-        ++use_quantizer_true_count;
+      if (platform::HasOpINT8DataType(node->Op())) {
+        ++int8_data_type_count;
       }
     }
   }
 
-  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+  EXPECT_EQ(int8_data_type_count, expected_int8_data_type_count);
 }
 
-void DefaultAttrTest(unsigned expected_use_quantizer_true_count) {
+void DefaultAttrTest(unsigned expected_int8_data_type_count) {
   auto prog = BuildProgramDesc();
   std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
   auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");
   graph.reset(pass->Apply(graph.release()));
 
-  unsigned use_quantizer_true_count = 0;
+  unsigned int8_data_type_count = 0;
   for (auto* node : graph->Nodes()) {
     if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_quantizer") &&
-          BOOST_GET_CONST(bool, op->GetAttr("use_quantizer"))) {
-        ++use_quantizer_true_count;
+      if (platform::HasOpINT8DataType(node->Op())) {
+        ++int8_data_type_count;
       }
     }
   }
-  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+  EXPECT_EQ(int8_data_type_count, expected_int8_data_type_count);
 }
 
 TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }
@@ -136,14 +130,14 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
   MainTest({"conv2d"}, {4}, 1);
 }
 
-TEST(QuantizerPlacementPass, excluded_none) {
-  // 2 conv + 2 pool
-  MainTest({}, {}, 4);
+TEST(QuantizerPlacementPass, empty_list) {
+  // no operator quantized
+  MainTest({}, {}, 0);
 }
 
 TEST(QuantizerPlacementPass, default_attr_value) {
-  // 2 conv + 2 pool
-  DefaultAttrTest(4);
+  //  all operators quantized
+  DefaultAttrTest(6);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/subgraph_detector.cc b/paddle/fluid/framework/ir/subgraph_detector.cc
index 62c91af15da60b9b0a74028afb0aeb689073b524..7979953d7be827ffc944ae939782923504802bbc 100644
--- a/paddle/fluid/framework/ir/subgraph_detector.cc
+++ b/paddle/fluid/framework/ir/subgraph_detector.cc
@@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;
 
     if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
+      VLOG(4) << brief_node->node->id() << " node named "
+              << brief_node->node->Name() << " is not a trt candidate.";
       continue;
     }
 
diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h
index d46f8a574c0d956dc0a90bc2741d2cb80313ab7f..4307e51862df572e013431fceaaf89cc1cf6679c 100644
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     // CPU, CUDA, PLAIN are same library type.
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
+  } else if (s == std::string("XPU")) {
+    return LibraryType::kPlain;
   } else if (s == std::string("CUDA")) {
     return LibraryType::kPlain;
   } else {
diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc
index 4ae26903e66c521f26eb3514622f03f7338c64e1..030e80c0b3fa12ea2dd8f0dcc676a42ef68db3ea 100644
--- a/paddle/fluid/framework/multi_trainer.cc
+++ b/paddle/fluid/framework/multi_trainer.cc
@@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
     workers_[i]->SetRootScope(root_scope_);
     workers_[i]->CreateDeviceResource(main_program);  // Program
     workers_[i]->BindingDataFeedMemory();
+    workers_[i]->CacheProgram(main_program);
   }
 }
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 66fe71a80a7b0165a0d4afb38c89fc1fdb339190..bccc92e5c4352927f309f3605bb3c8d8dd823bb5 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_desc.h"
+
 #include <algorithm>
 #include <functional>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <utility>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_call_stack.h"
@@ -51,23 +53,62 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   std::vector<std::string> Outputs(const std::string &name) const override;
 
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto &op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto &op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
+
   void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+
     std::string input_n = Inputs(in)[i];
     std::string output_n = Outputs(out)[j];
 
-    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
-                   in, i);
-    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_NE(input_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(output_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
 
     auto *in_var = block_.FindVarRecursive(input_n);
     auto *out_var = block_.FindVarRecursive(output_n);
 
-    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
-                   "The type of %s and %s is not the same.", input_n, output_n);
+    PADDLE_ENFORCE_EQ(
+        in_var->GetType(), out_var->GetType(),
+        platform::errors::InvalidArgument(
+            "The type of input %s and output %s do not match. The input type "
+            "is %s, output type is %s.",
+            input_n, output_n, DataTypeToString(in_var->GetType()),
+            DataTypeToString(out_var->GetType())));
 
     SetDim(output_n, GetDim(input_n));
   }
@@ -101,12 +142,22 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+    PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
@@ -119,30 +170,38 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   int32_t GetLoDLevel(const std::string &in, size_t i = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size(),
-                      "Input %s of operator %s only has %d elements.", in,
-                      op_.Type(), Inputs(in).size());
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, input "
+                          "variable %s of operator %s only has %d elements.",
+                          in, op_.Type(), Inputs(in).size()));
     PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
-                      "Input %s[%d] of operator %s is @EMPTY@", in, op_.Type(),
-                      i);
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] of operator %s is empty.",
+                          in, i, op_.Type()));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     PADDLE_ENFORCE_NOT_NULL(
-        in_var, "Input %s[%d] of operator %s should not be nullptr.", in,
-        op_.Type(), i);
+        in_var, platform::errors::NotFound(
+                    "The input variable %s[%d] of operator %s is not found.",
+                    in, i, op_.Type()));
     return in_var->GetLoDLevel();
   }
 
   void SetLoDLevel(const std::string &out, int32_t lod_level,
                    size_t j = 0) const override {
     PADDLE_ENFORCE_LT(j, Outputs(out).size(),
-                      "Output %s of operator %s only has %d elements.", out,
-                      op_.Type(), Outputs(out).size());
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, output "
+                          "variable %s of operator %s only has %d elements.",
+                          out, op_.Type(), Outputs(out).size()));
     PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
-                      "Output %s[%d] of operator %s is @EMPTY@", out,
-                      op_.Type(), j);
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] of operator %s is empty.",
+                          out, j, op_.Type()));
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     PADDLE_ENFORCE_NOT_NULL(
-        out_var, "Output %s[%d] of operator %s should not be nullptr.", out,
-        op_.Type(), j);
+        out_var, platform::errors::NotFound(
+                     "The output variable %s[%d] of operator %s is not found.",
+                     out, j, op_.Type()));
     if (lod_level >= 0) {
       out_var->SetLoDLevel(lod_level);
     }
@@ -175,8 +234,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetInputDim(const std::string &name) const override {
     const std::vector<std::string> &arg_names = Inputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The input(%s) should hold only one element, but now "
+                          "it holds %d elements.",
+                          name, arg_names.size()));
     return this->GetDim(arg_names[0]);
   }
 
@@ -200,8 +261,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetOutputDim(const std::string &name, const DDim &dim) override {
     auto arg_names = Outputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The iutput(%s) should hold only one element, but "
+                          "now it holds %d elements.",
+                          name, arg_names.size()));
     SetDim(arg_names[0], dim);
   }
 
@@ -227,7 +290,8 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   DDim GetDim(const std::string &name) const {
     auto var = block_.FindVarRecursive(name);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found.", name));
     DDim res;
     try {
       auto shape = var->GetShape();
@@ -253,7 +317,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<DDim> &dims) {
     size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
+    PADDLE_ENFORCE_EQ(length, dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The input variables number(%d) and input dimensions "
+                          "number(%d) do not match.",
+                          length, dims.size()));
     for (size_t i = 0; i < length; ++i) {
       if (names[i] == framework::kEmptyVarName) {
         continue;
@@ -339,8 +407,10 @@ proto::OpDesc *OpDesc::Proto() {
 
 const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
-                 Type());
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Input %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
 
@@ -360,8 +430,10 @@ void OpDesc::SetInput(const std::string &param_name,
 
 const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                 name, Type());
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound("Output %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
 
@@ -402,7 +474,8 @@ bool OpDesc::HasProtoAttr(const std::string &name) const {
 
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return static_cast<proto::AttrType>(it->second.which() - 1);
 }
 
@@ -467,7 +540,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
         return;
       }
       default:
-        PADDLE_THROW("Wrong attr type %d", attr.type());
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported attribute type (code %d).", attr.type()));
     }
     need_update_ = true;
     return;
@@ -504,7 +578,8 @@ void OpDesc::SetAttrMap(
 
 Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return it->second;
 }
 
@@ -518,7 +593,8 @@ const proto::OpProto::Attr &OpDesc::GetProtoAttr(
     }
   }
 
-  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
+  PADDLE_THROW(platform::errors::NotFound(
+      "Attribute %s is not found in proto %s.", name, proto.type()));
 }
 
 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
@@ -532,7 +608,10 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
 
 std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   auto blocks = BOOST_GET_CONST(std::vector<BlockDesc *>, it->second);
 
   std::vector<int> ids;
@@ -545,7 +624,10 @@ std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
 
 int OpDesc::GetBlockAttrId(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   return BOOST_GET_CONST(BlockDesc *, it->second)->ID();
 }
 
@@ -632,7 +714,11 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     VectorToRepeated(v, attr_->mutable_longs());
   }
 
-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method of SetAttrDescVisitor object for "
+        "`boosst::blank` type."));
+  }
 };
 
 void OpDesc::Flush() {
@@ -666,8 +752,9 @@ void OpDesc::Flush() {
 }
 
 void OpDesc::CheckAttrs() {
-  PADDLE_ENFORCE(!Type().empty(),
-                 "CheckAttr() can not be called before type is set.");
+  PADDLE_ENFORCE_EQ(Type().empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "CheckAttrs() can not be called before type is set."));
   auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
   if (checker == nullptr) {
     // checker is not configured. That operator could be generated by Paddle,
@@ -682,8 +769,10 @@ void OpDesc::InferShape(const BlockDesc &block) const {
   try {
     VLOG(3) << "CompileTime infer shape on " << Type();
     auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(infer_shape), true,
+        platform::errors::NotFound(
+            "Operator %s's infer_shape is not registered.", this->Type()));
     CompileTimeInferShapeContext ctx(*this, block);
     if (VLOG_IS_ON(10)) {
       std::ostringstream sout;
@@ -733,10 +822,10 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Input(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(input_names[0]);
 }
 
@@ -749,10 +838,10 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Output(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(output_names[0]);
 }
 
@@ -801,7 +890,8 @@ std::vector<std::string> CompileTimeInferShapeContext::Outputs(
 std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
     const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<DDim> res;
   try {
     auto shapes = var->GetShapes();
@@ -823,7 +913,8 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 void CompileTimeInferShapeContext::SetRepeatedDims(
     const std::string &name, const std::vector<DDim> &dims) {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<std::vector<int64_t>> dim_vec(dims.size());
   std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>);
   var->SetShapes(dim_vec);
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 0f842637a58e0897e8b68fe06d1e712ffd20ad97..d8159d6a5c294b85d8d5ab9bbee3b95a5eba793f 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 
+#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
+
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
                               customized_name,                     \
                               customized_type_value,               \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
 
+#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11b7224e683402264573019e1541c5645a3a7514
--- /dev/null
+++ b/paddle/fluid/framework/op_version_registry.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a85c60305bd36e78c071f5703885c23e33b403e
--- /dev/null
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <boost/any.hpp>
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace compatible {
+
+struct OpUpdateRecord {
+  enum class Type {
+    kInvalid = 0,
+    kModifyAttr,
+    kNewAttr,
+    kNewInput,
+    kNewOutput
+  };
+  Type type_;
+  std::string remark_;
+};
+
+struct ModifyAttr : OpUpdateRecord {
+  ModifyAttr(const std::string& name, const std::string& remark,
+             const boost::any& default_value)
+      : OpUpdateRecord({Type::kModifyAttr, remark}),
+        name_(name),
+        default_value_(default_value) {
+    // TODO(Shixiaowei02): Check the data type with proto::OpDesc.
+  }
+
+ private:
+  std::string name_;
+  boost::any default_value_;
+};
+
+struct NewAttr : OpUpdateRecord {
+  NewAttr(const std::string& name, const std::string& remark,
+          const boost::any& default_value)
+      : OpUpdateRecord({Type::kNewAttr, remark}),
+        name_(name),
+        default_value_(default_value) {}
+
+ private:
+  std::string name_;
+  boost::any default_value_;
+};
+
+struct NewInput : OpUpdateRecord {
+  NewInput(const std::string& name, const std::string& remark)
+      : OpUpdateRecord({Type::kNewInput, remark}), name_(name) {}
+
+ private:
+  std::string name_;
+};
+
+struct NewOutput : OpUpdateRecord {
+  NewOutput(const std::string& name, const std::string& remark)
+      : OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {}
+
+ private:
+  std::string name_;
+};
+
+class OpVersionDesc {
+ public:
+  OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
+                            boost::any default_value) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::ModifyAttr(name, remark, default_value)));
+    return *this;
+  }
+
+  OpVersionDesc& NewAttr(const std::string& name, const std::string& remark,
+                         boost::any default_value) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::NewAttr(name, remark, default_value)));
+    return *this;
+  }
+
+  OpVersionDesc& NewInput(const std::string& name, const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::NewInput(name, remark)));
+    return *this;
+  }
+
+  OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::NewOutput(name, remark)));
+    return *this;
+  }
+
+ private:
+  std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
+};
+
+class OpVersion {
+ public:
+  OpVersion& AddCheckpoint(const std::string& note,
+                           const OpVersionDesc& op_version_desc) {
+    checkpoints_.push_back(Checkpoint({note, op_version_desc}));
+    return *this;
+  }
+
+ private:
+  struct Checkpoint {
+    std::string note_;
+    OpVersionDesc op_version_desc_;
+  };
+  std::vector<Checkpoint> checkpoints_;
+};
+
+class OpVersionRegistrar {
+ public:
+  static OpVersionRegistrar& GetInstance() {
+    static OpVersionRegistrar instance;
+    return instance;
+  }
+  OpVersion& Register(const std::string& op_type) {
+    if (op_version_map_.find(op_type) != op_version_map_.end()) {
+      PADDLE_THROW("'%s' is registered in operator version more than once.",
+                   op_type);
+    }
+    op_version_map_.insert({op_type, OpVersion()});
+    return op_version_map_[op_type];
+  }
+
+ private:
+  std::unordered_map<std::string, OpVersion> op_version_map_;
+
+  OpVersionRegistrar() = default;
+  OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
+};
+
+}  // namespace compatible
+}  // namespace framework
+}  // namespace paddle
+
+#define REGISTER_OP_VERSION(op_type)                                       \
+  static paddle::framework::compatible::OpVersion                          \
+      RegisterOpVersion__##op_type =                                       \
+          paddle::framework::compatible::OpVersionRegistrar::GetInstance() \
+              .Register(#op_type)
diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..052bf3a4b882be749e70704f18f09a7b24551ed7
--- /dev/null
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace compatible {
+
+TEST(test_operator_version, test_operator_version) {
+  REGISTER_OP_VERSION(test__)
+      .AddCheckpoint(
+          R"ROC(
+        Upgrade reshape, modified one attribute [axis] and add a new attribute [size].
+      )ROC",
+          framework::compatible::OpVersionDesc()
+              .ModifyAttr("axis",
+                          "Increased from the original one method to two.", -1)
+              .NewAttr("size",
+                       "In order to represent a two-dimensional rectangle, the "
+                       "parameter size is added.",
+                       0))
+      .AddCheckpoint(
+          R"ROC(
+        Add a new attribute [height]
+      )ROC",
+          framework::compatible::OpVersionDesc().NewAttr(
+              "height",
+              "In order to represent a two-dimensional rectangle, the "
+              "parameter height is added.",
+              0))
+      .AddCheckpoint(
+          R"ROC(
+        Add a input [X2] and a output [Y2]
+      )ROC",
+          framework::compatible::OpVersionDesc()
+              .NewInput("X2", "The second input.")
+              .NewOutput("Y2", "The second output."));
+}
+}  // namespace compatible
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9c293bcdb852ff1ab5b1494838ee2c947cd372cc..ca2705f154c4f45dfccd954b23209c71701adce5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/operator.h"
+
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 
@@ -20,18 +22,21 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -163,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #else
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
       platform::SetDeviceId(dev_id);
+#endif
+    } else if (platform::is_xpu_place(place)) {
+#ifndef PADDLE_WITH_XPU
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Cannot run operator on place %s", place));
+#else
+      auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+      platform::SetXPUDeviceId(dev_id);
 #endif
     }
 
@@ -604,6 +617,29 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
 
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
+
   void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) override {
     auto in_it = ctx_.inputs.find(in);
@@ -1084,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
     expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
     kernel_iter = kernels.find(expected_kernel_key);
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  if (kernel_iter == kernels.end() &&
+      is_xpu_place(expected_kernel_key.place_)) {
+    VLOG(3) << "missing XPU kernel: " << type_
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", fallbacking to CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
   if (kernel_iter == kernels.end()) {
     PADDLE_THROW("op %s does not have kernel for %s", type_,
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 709f132813c7da23bc2ab77f7cfb586d4d11edbf..ebecbf0498c384a55627e2b5cb31304d098a444c 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -64,9 +64,6 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 
-/// Variables with this suffix are the loaded from pre-train model.
-constexpr char kLoadedVarSuffix[] = "@LOADED";
-
 /// RuntimeContext is used to relate input/output names of Operator with
 /// the corresponding variables in name scope.
 /// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 8c6dd628bb9748bb120c1c39841e199659fb53fc..12e0f97f1262ca0f6bf8fc70ab5b482fb0bdd305 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places, scope)) {
+  PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
+                 platform::errors::Unavailable(
+                     "XPU is not supported in ParallelExecutor"));
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
   member_->use_cuda_ = exec_strategy.use_cuda_;
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
index 919378c929185b12826c8b427d0e9a86a382bb2b..274b0ca0d903d4e89c7bceb74bc16581f03bb584 100644
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
       should_run.push_back(true);
     } else {
       should_run.push_back(false);
+      // If the output of an op modifies feed vars, the op should not clip.
+      // For example, in the transformer structure, the third parameter returned
+      // by beam_search op is generally assigned to a feed var. Cutting the
+      // assign op will cause an error.
+      if (parent_block_id != -1) {
+        bool flag = false;
+        for (auto& var : op_desc.outputs()) {
+          for (auto& argu : var.arguments()) {
+            if (feed_var_names.count(argu)) {
+              flag = true;
+            }
+          }
+        }
+        if (flag) {
+          should_run.back() = true;
+        }
+      }
     }
   }
 
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index eb5c241a8372a460483c70e38f962168b1cdbbc0..12fa0c61f8121d475a0cf2aa78e4bb995a01b132 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) {
   EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
   EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
 }
+
+// If the output of an op modifies feed vars, the op should not clip.
+TEST(Prune, recurrrent_op_2) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::BlockDesc *sub_block = program.AppendBlock(*block);
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+
+  std::vector<std::string> state_var_name(1, "y");
+  AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
+        {{"ex_states", state_var_name},
+         {"states", state_var_name},
+         {"sub_block", sub_block}},
+        block);
+
+  EXPECT_TRUE(sub_block != nullptr);
+  AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"a"}}},
+        f::AttributeMap{}, sub_block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
+
+  f::proto::ProgramDesc pruned;
+  std::set<std::string> feed_var_names = {"x", "a"};
+
+  f::Prune(*pdesc, feed_var_names, &pruned);
+  EXPECT_EQ(pruned.blocks_size(), 2);
+  EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
+  EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
+}
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
index d66e1d8062e0b3fa91e9e3de3290806c4397bff2..9f4c817db7d81ff6d8a9afb9d5cee7b4f1dd0ed2 100644
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -56,6 +56,34 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
     current_version_[tid] = 0;
   }
   fleet_ptr_ = FleetWrapper::GetInstance();
+#ifdef PADDLE_WITH_CUDA
+  copy_streams_.clear();
+  places_.clear();
+  thread_scopes_.clear();
+#endif
+}
+
+void PullDenseWorker::CreatePinVar() {
+#ifdef PADDLE_WITH_CUDA
+  // for (auto& v : dense_value_names_) {
+  //  for (auto& name : v.second) {
+  for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
+       ++i) {
+    uint64_t tid = static_cast<uint64_t>(
+        dwp_param_.program_config(0).pull_dense_table_id(i));
+    for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
+      auto& name = dense_value_names_[tid][j];
+      Variable* var = root_scope_->FindVar(name);
+
+      LoDTensor* tensor = var->GetMutable<LoDTensor>();
+      auto* ptr = root_scope_->Var(name + "pin");
+      InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
+      LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
+      pin_tensor->mutable_data<float>(tensor->dims(),
+                                      platform::CUDAPinnedPlace());
+    }
+  }
+#endif
 }
 
 void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
@@ -75,6 +103,31 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
     exit(-1);
   }
   status_vec->resize(0);
+#ifdef PADDLE_WITH_CUDA
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    // for (auto& v : dense_value_names_) {
+    //  for (auto& name : v.second) {
+    for (int x = 0; x < dwp_param_.program_config(0).pull_dense_table_id_size();
+         ++x) {
+      uint64_t tid = static_cast<uint64_t>(
+          dwp_param_.program_config(0).pull_dense_table_id(x));
+      for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
+        auto& name = dense_value_names_[tid][j];
+
+        Variable* pin_var = root_scope_->FindVar(name + "pin");
+        LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+        float* pin_w = pin_tensor->data<float>();
+        Variable* var = thread_scopes_[i]->FindVar(name);
+        LoDTensor* tensor = var->GetMutable<LoDTensor>();
+        float* w = tensor->data<float>();
+        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
+                     platform::CUDAPinnedPlace(), pin_w,
+                     sizeof(float) * tensor->numel(), copy_streams_[i]);
+      }
+    }
+  }
+#endif
 }
 
 void PullDenseWorker::Stop() {
@@ -91,8 +144,14 @@ void PullDenseWorker::PullDense(bool force_update) {
     uint64_t tid = static_cast<uint64_t>(
         dwp_param_.program_config(0).pull_dense_table_id(i));
     if (force_update || CheckUpdateParam(tid)) {
+#ifdef PADDLE_WITH_CUDA
+      VLOG(3) << "pull dense " << force_update << " " << tid;
       fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
-                                     &pull_dense_status_);
+                                     &pull_dense_status_, false);
+#else
+      fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
+                                     &pull_dense_status_, true);
+#endif
       ResetThreadVersion(tid);
     }
   }
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 7ce8deb7cfc70d39de52e1fd9e5bace969f854e7..8d8a8f01b3f38c82a480bf7204721481586cc860 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -52,7 +53,8 @@ class InferShapeContext {
                              const std::vector<DDim> &dims) = 0;
   virtual void SetReaderDims(const std::string &name,
                              const std::vector<DDim> &dims);
-
+  virtual std::string GetInputNameByIdx(size_t idx) const = 0;
+  virtual std::string GetOutputNameByIdx(size_t idx) const = 0;
   virtual AttrReader Attrs() const = 0;
   virtual std::vector<std::string> Inputs(const std::string &name) const = 0;
   virtual std::vector<std::string> Outputs(const std::string &name) const = 0;
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 50637a0c3d3f9c6975578e94e6ddc2c898c926e0..3b3271fc5b936e65b60930f43ea5c4f6f8448941 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -54,14 +54,43 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   }
+#ifdef PADDLE_WITH_XPU
+  else if (platform::is_xpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_xpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_xpu_place(src_place) &&
+             platform::is_xpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
                  size);
-  } else if (platform::is_gpu_place(src_place) &&  // NOLINT
-             platform::is_cpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
@@ -71,8 +100,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
@@ -82,8 +112,32 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
-  } else if (platform::is_cuda_pinned_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
+    auto dst_cuda_pinned_place =
+        BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true,
+                      platform::errors::PreconditionNotMet(
+                          "Device context place mismatch. When copying Tensor "
+                          "data from GPU memory to CUDA Pinned memory, current "
+                          "device context place should be GPU."));
+    auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place,
+                      platform::errors::PreconditionNotMet(
+                          "The source GPU device and current device context do "
+                          "not match. The source GPU device number is %d, but "
+                          "device context GPU number is %d.",
+                          src_gpu_place.device, ctx_gpu_place.device));
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size,
+                 stream);
+  }
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_cuda_pinned_place =
         BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
@@ -104,8 +158,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size,
                  stream);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
@@ -128,7 +183,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
       }
     }
-  } else {
+  }
+  else {  // NOLINT
     PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
   }
 #endif
@@ -174,35 +230,74 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   }
+#ifdef PADDLE_WITH_XPU
+  else if (platform::is_xpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
+             platform::is_xpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_xpu_place(src_place) &&  // NOLINT
+             platform::is_xpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
+  } else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
                  size);
-  } else if (platform::is_gpu_place(src_place) &&  // NOLINT
-             platform::is_cpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CUDAPlace, src_place), src_ptr, size,
+                 nullptr);
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cuda_pinned_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
+  }
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_gpu_place(dst_place)) {
     auto src_pinned_place =
         BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
                  nullptr);
-  } else {
+  }
+  else {  // NOLINT
     PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
   }
 #endif
@@ -241,6 +336,19 @@ class AnyVisitor : public boost::static_visitor<bool> {
   const framework::Tensor& tensor_;
   Predicate predicate_;
 
+  bool GetResultHelper(const framework::Tensor& out,
+                       const platform::Place& place) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto ctx = platform::DeviceContextPool::Instance().Get(place);
+    ctx->Wait();
+    TensorCopy(out, cpu, *ctx, &tmp);
+    ctx->Wait();
+    return GetResult(tmp, cpu);
+  }
+
  public:
   AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
       : tensor_(tensor), predicate_(std::move(predicate)) {}
@@ -255,17 +363,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
     return this->GetResult(out, place);
   }
 
+  bool GetResult(const framework::Tensor& out,
+                 const platform::XPUPlace& xpu) const {
+    return GetResultHelper(out, xpu);
+  }
+
   bool GetResult(const framework::Tensor& out,
                  const platform::CUDAPlace& gpu) const {
-    platform::CPUPlace cpu;
-    framework::Tensor tmp;
-    tmp.Resize({1});
-    tmp.mutable_data<bool>(cpu);
-    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
-    gpuctx->Wait();
-    TensorCopy(out, cpu, *gpuctx, &tmp);
-    gpuctx->Wait();
-    return GetResult(tmp, cpu);
+    return GetResultHelper(out, gpu);
   }
 
   bool GetResult(const framework::Tensor& out,
@@ -315,6 +420,61 @@ inline void Any(const framework::Tensor& tensor, Predicate predicate,
   platform::VisitPlace(place, visitor);
 }
 
+template <typename Predicate, typename DevCtx>
+struct AllDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AllDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void apply() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenVector<bool>::Flatten(*out_);
+    o.device(*ctx_.eigen_device()) = predicate_(t);
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AllImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(tensor.type(), AllDTypeVisitor<Predicate, DevCtx>(
+                                   predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+class AllOutVisitor : public boost::static_visitor<> {
+ private:
+  const framework::Tensor& tensor_;
+  mutable framework::Tensor* out_;
+  Predicate predicate_;
+
+ public:
+  AllOutVisitor(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out)
+      : tensor_(tensor), out_(out), predicate_(predicate) {}
+
+  template <typename Place>
+  void operator()(const Place& place) const {
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    out_->Resize(tensor_.dims());
+    out_->mutable_data<bool>(place);
+    AllImpl(predicate_, tensor_, *ctx, out_);
+  }
+};
+
+template <typename Predicate>
+inline void All(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out) {
+  AllOutVisitor<Predicate> visitor(tensor, predicate, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
+
 struct ContainsNANPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
@@ -335,6 +495,12 @@ void TensorContainsNAN(const framework::Tensor& tensor,
   Any(tensor, predicate, out);
 }
 
+void TensorContainsNANV2(const framework::Tensor& tensor,
+                         framework::Tensor* out) {
+  ContainsNANPredicate predicate;
+  All(tensor, predicate, out);
+}
+
 struct ContainsInfPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
@@ -355,6 +521,12 @@ void TensorContainsInf(const framework::Tensor& tensor,
   Any(tensor, predicate, out);
 }
 
+void TensorContainsInfV2(const framework::Tensor& tensor,
+                         framework::Tensor* out) {
+  ContainsInfPredicate predicate;
+  All(tensor, predicate, out);
+}
+
 // NOTE(dzhwinter):
 // Isfinite need a AllVisitor to loop through all the elements.
 // We choose two cuda call instead of one allvisitor. The AllVisitor
@@ -367,8 +539,8 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
 
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
-static inline void __global__ BothFalse(const T* cmp, T* out) {
-  out[0] = (!cmp[0]) && (!out[0]);
+static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
+  CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
 }
 #endif
 
@@ -383,25 +555,47 @@ struct BothFalseVisitor : public boost::static_visitor<> {
     VisitorImpl(place);
   }
 
+  void VisitorImpl(const platform::XPUPlace& xpu) const {
+    PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+  }
+
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
 #ifdef PADDLE_WITH_CUDA
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
-    BothFalse<bool><<<1, 1, 0, ctx->stream()>>>(in_.data<bool>(),
-                                                out_->mutable_data<bool>(gpu));
+    constexpr int MAX_BLOCK_DIM = 512;
+    const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
+    int element_num = in_.numel();
+    int block_size = (element_num >= MAX_BLOCK_DIM)
+                         ? MAX_BLOCK_DIM
+                         : (1 << static_cast<int>(std::log2(element_num)));
+    int grid_size = element_num / block_size;
+    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
+    BothFalse<bool><<<grid_size, block_size, 0, ctx->stream()>>>(
+        in_.data<bool>(), out_->mutable_data<bool>(gpu), element_num);
 #endif
   }
 
   void VisitorImpl(const platform::CPUPlace& cpu) const {
-    bool lhs = !in_.data<bool>()[0];
-    bool rhs = !out_->mutable_data<bool>(cpu)[0];
-    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+    int num = in_.numel();
+    const bool* in_ptr = in_.data<bool>();
+    bool* out_ptr = out_->data<bool>();
+    for (int i = 0; i < num; ++i) {
+      bool lhs = !in_ptr[i];
+      bool rhs = !out_ptr[i];
+      out_ptr[i] = lhs && rhs;
+    }
   }
 
   void VisitorImpl(
       const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
-    bool lhs = !in_.data<bool>()[0];
-    bool rhs = !out_->mutable_data<bool>(cpu)[0];
-    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+    int num = in_.numel();
+    const bool* in_ptr = in_.data<bool>();
+    bool* out_ptr = out_->data<bool>();
+    for (int i = 0; i < num; ++i) {
+      bool lhs = !in_ptr[i];
+      bool rhs = !out_ptr[i];
+      out_ptr[i] = lhs && rhs;
+    }
   }
 };
 
@@ -414,6 +608,15 @@ void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
   platform::VisitPlace(place, visitor);
 }
 
+void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out) {
+  framework::Tensor tmp;
+  TensorContainsInfV2(tensor, &tmp);
+  TensorContainsNANV2(tensor, out);
+  BothFalseVisitor visitor(tmp, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
+
 void TensorToStream(std::ostream& os, const Tensor& tensor,
                     const platform::DeviceContext& dev_ctx) {
   {  // the 1st field, uint32_t version
@@ -463,6 +666,28 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
 #else
       PADDLE_THROW(platform::errors::Unimplemented(
           "CUDAPlace is not supported when not compiled with CUDA"));
+#endif
+    } else if (platform::is_xpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_XPU
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& xpu_dev_ctx =
+          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     BOOST_GET_CONST(platform::XPUPlace, tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write);
+        xpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "XPUPlace is not supported when not compiled with XPU"));
 #endif
     } else {
       os.write(static_cast<const char*>(data_ptr),
@@ -517,8 +742,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     void* buf;
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
+        platform::is_xpu_place(dev_ctx.GetPlace())) {
+#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -528,8 +754,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "CUDAPlace is not supported when not compiled with CUDA"));
+      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "CUDAPlace is not supported when not compiled with CUDA"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "XPUPlace is not supported when not compiled with XPU"));
+      }
 #endif
     } else {
       framework::VisitDataType(
@@ -568,8 +799,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     void* buf;
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
+        platform::is_xpu_place(dev_ctx.GetPlace())) {
+#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -579,8 +811,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "CUDAPlace is not supported when not compiled with CUDA"));
+      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "CUDAPlace is not supported when not compiled with CUDA"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "XPUPlace is not supported when not compiled with XPU"));
+      }
 #endif
     } else {
       framework::VisitDataType(
@@ -665,6 +902,9 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
         reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
   }
 #endif
+#ifdef PADDLE_WITH_XPU
+  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+#endif
 }
 
 template <typename T>
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index c71327da64042aed85f1247f3c31de3e66a588ba..fce0142b41d3ae9b2a6fcd4f16d38b0492fbd806 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
                       const platform::DeviceContext& dev_ctx,
                       const size_t& seek, const std::vector<int64_t>& shape);
 
+// store the bool result tensor in out tensor
+void TensorContainsNANV2(const framework::Tensor& tensor,
+                         framework::Tensor* out);
+void TensorContainsInfV2(const framework::Tensor& tensor,
+                         framework::Tensor* out);
+void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out);
+
 // convert dlpack's DLTensor to tensor
 void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst);
 
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 5c4c511b2bb530f3bc711c24addd56063f99992f..1f97024d97059e2924e896cb0584e40be61e4ddc 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -21,9 +21,12 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <vector>
 
+#include <ctime>
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/fleet/heter_wrapper.h"
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/reader.h"
@@ -62,6 +65,7 @@ class TrainerBase {
   Scope* root_scope_;
   bool debug_;
   Dataset* dataset_ptr_;
+  TrainerDesc trainer_desc_;
 
   // For dump param or field
   bool need_dump_field_ = false;
@@ -118,10 +122,86 @@ class DistMultiTrainer : public MultiTrainer {
   void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
   virtual void InitDumpEnv();
   virtual Scope* GetWorkerScope(int thread_id);
+  virtual void RegisterHeterCallback();
+
+ protected:
+  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
+};
+
+#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+class HeterServiceContext {
+ public:
+  HeterServiceContext() {}
+  virtual ~HeterServiceContext() {
+    for (OperatorBase* op : ops_) {
+      delete op;
+    }
+    std::vector<OperatorBase*>().swap(ops_);
+  }
+  void Reset() { push_dense_status_.clear(); }
+  int place_num_;
+  Scope* scope_{nullptr};
+  cudaEvent_t event_;
+  std::vector<OperatorBase*> ops_;
+  std::vector<::std::future<int32_t>> push_dense_status_;
+};
+
+class HeterXpuTrainer : public TrainerBase {
+ public:
+  HeterXpuTrainer() {}
+  virtual ~HeterXpuTrainer() {
+    for (OperatorBase* op : ops_) {
+      delete op;
+    }
+    std::vector<OperatorBase*>().swap(ops_);
+  }
+  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
+  virtual void InitTrainerEnv(const ProgramDesc& main_program,
+                              const platform::Place& place);
+  virtual void InitOtherEnv(const ProgramDesc& main_program);
+  virtual void Run();
+  virtual void Finalize();
+  virtual void DumpWork(int tid);
+  virtual void RegisterServiceHandler();
+  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
+  virtual Scope* GetWorkerScope(int thread_id);
+  virtual void CacheProgram(const ProgramDesc& main_program) {
+    new (&program_) ProgramDesc(main_program);
+  }
+  template <typename T>
+  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
+                   const paddle::platform::Place& thread_place,
+                   cudaStream_t stream);
+  void CreateThreadParam(const ProgramDesc& program, int num);
+  template <typename T>
+  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
+  int EndPass(const HeterRequest* request, HeterResponse* response);
+  int StopService(const HeterRequest* request, HeterResponse* response);
 
  protected:
+  DownpourWorkerParameter param_;
+  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
+  std::vector<std::string> need_merge_var_names_;
+  float scale_datanorm_;
+  int xpu_begin_op_index_;
+  int xpu_end_op_index_;
+  bool running_;
+  paddle::platform::Place place_;
+  std::mutex mutex_;
+  ProgramDesc program_;
+  std::condition_variable cond_;
+  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
+  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
+  std::vector<OperatorBase*> ops_;
+  std::vector<std::string> op_names_;
+  std::vector<Scope*> place_scopes_;
+  BtObjectPool<HeterServiceContext> object_pool_;
+  std::vector<cudaStream_t> copy_streams_;
+  std::vector<platform::Place> places_;
+  std::vector<cudaEvent_t> events_;
 };
+#endif
 
 #if defined(PADDLE_WITH_NCCL)
 class PipelineTrainer : public TrainerBase {
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 670ae074c7c7f0e3bcd91e157ba7b01b48d3b7ee..1985742fc4aa6a0fc67f552f2b69902840a00d0f 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -52,6 +52,12 @@ message TrainerDesc {
   optional bool enable_random_dump = 24 [ default = false ];
   optional bool random_with_lineid = 25 [ default = false ];
   optional int32 dump_interval = 26 [ default = 10000 ];
+  repeated int32 worker_places = 27;
+
+  repeated string xpu_send_list = 28;
+  repeated string xpu_recv_list = 29;
+  optional int32 xpu_start_idx = 30;
+  optional int32 xpu_end_idx = 31;
 
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index 23cfa11d4c9b2ef8542cd318970a58cad84e662d..31ac11e78cff15f12660b84b96a007690aa77ae3 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -63,6 +63,9 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
+#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+REGISTER_TRAINER_CLASS(HeterXpuTrainer);
+#endif
 #if defined(PADDLE_WITH_NCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc
index eee100bc81c337942fe9e051f63155e8b07c1cb8..e7e964b41818597404d9a6e510c0743f4f8c3f7e 100644
--- a/paddle/fluid/framework/unused_var_check.cc
+++ b/paddle/fluid/framework/unused_var_check.cc
@@ -28,38 +28,6 @@ DEFINE_bool(enable_unused_var_check, false,
             "Checking whether operator contains unused inputs, "
             "especially for grad operator. It should be in unittest.");
 
-// NOTE(zhiqiu): Currently, there are some operators which involves unused
-// inputs and cannot be removed from the allow_list below.
-// They can be mainly divided into four categories:
-// 0: the inputs of which are only used in if branch, or used in cuda kernel but
-// not in cpu kernel;
-// 1: the inputs of which are used to indicate dtype of outputs;
-// 2: the inputs of which are used in fused operators.
-// The category number is presented in the comments after each operator.
-
-const std::unordered_set<std::string> op_with_unsed_vars_allow_list = {
-    "batch_norm",                      // 0
-    "batch_norm_grad",                 // 0
-    "sync_batch_norm",                 // 0
-    "sync_batch_norm_grad",            // 0
-    "inplace_abn",                     // 0
-    "inplace_abn_grad",                // 0
-    "dgc_momentum",                    // 0
-    "fake_quantize_range_abs_max",     // 0
-    "rmsprop",                         // 0
-    "sequence_conv_grad",              // 0
-    "roi_perspective_transform_grad",  // 0
-    "fill_zeros_like",                 // 1
-    "fill_any_like",                   // 1
-    "nce_grad",                        // 1
-    "precision_recall",                // 1
-    "fusion_seqpool_cvm_concat",       // 2
-    "fused_batch_norm_act",            // 2
-    "fused_batch_norm_act_grad",       // 2
-    "data_norm",                       // 0
-    "data_norm_grad",                  // 0
-};
-
 namespace paddle {
 namespace framework {
 
@@ -75,9 +43,44 @@ void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
   }
 }
 
+static const std::unordered_set<std::string> &GetOpWithUnusedVarAllowSet() {
+  // NOTE(zhiqiu): Currently, there are some operators which involves unused
+  // inputs and cannot be removed from the allow_list below.
+  // They can be mainly divided into four categories:
+  // 0: the inputs of which are only used in if branch, or used in cuda kernel
+  // but not in cpu kernel; 1: the inputs of which are used to indicate dtype of
+  // outputs; 2: the inputs of which are used in fused operators. The category
+  // number is presented in the comments after each operator.
+  // Use pointer here for safe static deinitialization
+  static auto *allow_set = new std::unordered_set<std::string>({
+      // called once
+      "batch_norm",                      // 0
+      "batch_norm_grad",                 // 0
+      "sync_batch_norm",                 // 0
+      "sync_batch_norm_grad",            // 0
+      "inplace_abn",                     // 0
+      "inplace_abn_grad",                // 0
+      "dgc_momentum",                    // 0
+      "fake_quantize_range_abs_max",     // 0
+      "rmsprop",                         // 0
+      "sequence_conv_grad",              // 0
+      "roi_perspective_transform_grad",  // 0
+      "fill_zeros_like",                 // 1
+      "fill_any_like",                   // 1
+      "nce_grad",                        // 1
+      "precision_recall",                // 1
+      "fusion_seqpool_cvm_concat",       // 2
+      "fused_batch_norm_act",            // 2
+      "fused_batch_norm_act_grad",       // 2
+      "data_norm",                       // 0
+      "data_norm_grad",                  // 0);
+  });
+  return *allow_set;
+}
+
 void CheckUnusedVar(const OperatorBase &op, const Scope &scope) {
   // skip op in allow list.
-  if (op_with_unsed_vars_allow_list.count(op.Type()) != 0) {
+  if (GetOpWithUnusedVarAllowSet().count(op.Type()) != 0) {
     return;
   }
   auto *used_set = GetThreadLocalUsedVarNameSet();
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 4d602d5c0211e221a99e0e87a3344c5a9c2a0142..3d01e4fe46f10f1c9494026ca1cb21496ed6fe6b 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,10 +2,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
 
 cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function) 
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
 add_subdirectory(jit)
-
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
+cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
+cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c980b014b823e21f117bc6e44037349b06a1fdfd
--- /dev/null
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/amp_auto_cast.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/imperative/variable_wrapper.h"
+
+namespace paddle {
+namespace imperative {
+
+AmpOperators::AmpOperators()
+    : allow_ops_(new std::unordered_set<std::string>()),
+      block_ops_(new std::unordered_set<std::string>()) {}
+AmpOperators::~AmpOperators() {}
+
+AmpOperators& AmpOperators::Instance() {
+  static AmpOperators instance;
+  return instance;
+}
+
+std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetAllowOps() {
+  return allow_ops_;
+}
+
+std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetBlockOps() {
+  return block_ops_;
+}
+
+inline std::string GetDtypeStr(
+    const std::shared_ptr<imperative::VarBase>& var) {
+  return framework::DataTypeToString(var->DataType());
+}
+
+inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
+  if (!platform::is_gpu_place(var->Place())) {
+    return false;
+  }
+  if (var->DataType() == framework::proto::VarType::FP32 ||
+      var->DataType() == framework::proto::VarType::FP16) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad
+// var will be cast back from fp16 to fp32 during backward phase.
+static inline std::shared_ptr<imperative::VarBase> CastToType(
+    const std::shared_ptr<VarBase>& var,
+    const framework::proto::VarType::Type dst_type) {
+  const auto& tracer = imperative::GetCurrentTracer();
+  imperative::NameVarBaseMap ins = {{"X", {var}}};
+  framework::AttributeMap attrs = {{"in_dtype", var->DataType()},
+                                   {"out_dtype", dst_type}};
+  auto out = std::shared_ptr<imperative::VarBase>(
+      new imperative::VarBase(tracer->GenerateUniqueName()));
+  imperative::NameVarBaseMap outs = {{"Out", {out}}};
+
+  {
+    AutoCastGuard guard(tracer, false);
+    tracer->TraceOp("cast", ins, outs, std::move(attrs));
+  }
+
+  return out;
+}
+
+static inline std::shared_ptr<imperative::VarBase> CastToFP16(
+    const std::shared_ptr<VarBase>& var) {
+  auto dst_type = framework::proto::VarType::FP16;
+  if (NeedCast(var) && (var->DataType() != dst_type)) {
+    return CastToType(var, dst_type);
+  }
+  return var;
+}
+
+static inline std::shared_ptr<imperative::VarBase> CastToFP32(
+    const std::shared_ptr<VarBase>& var) {
+  auto dst_type = framework::proto::VarType::FP32;
+  if (NeedCast(var) && (var->DataType() != dst_type)) {
+    return CastToType(var, dst_type);
+  }
+  return var;
+}
+
+static inline framework::proto::VarType::Type GetPromoteType(
+    const NameVarBaseMap& ins) {
+  auto dst_type = framework::proto::VarType::FP16;
+  for (const auto& pair : ins) {
+    for (const auto& var : pair.second) {
+      if (var->DataType() == framework::proto::VarType::FP32) {
+        dst_type = var->DataType();
+        break;
+      }
+    }
+  }
+  return dst_type;
+}
+
+NameVarBaseMap AutoCastInputs(const std::string& op_type,
+                              const NameVarBaseMap& ins) {
+  NameVarBaseMap new_ins = {};
+  if (AmpOperators::Instance().GetAllowOps()->count(op_type)) {
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << GetDtypeStr(*pair.second.cbegin()) << " to float16";
+      for (const auto& var : pair.second) {
+        auto new_var = CastToFP16(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+    return new_ins;
+  } else if (AmpOperators::Instance().GetBlockOps()->count(op_type)) {
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << GetDtypeStr(*pair.second.cbegin()) << " to float";
+      for (const auto& var : pair.second) {
+        auto new_var = CastToFP32(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+    return new_ins;
+  } else {
+    auto dst_type = GetPromoteType(ins);
+
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << GetDtypeStr(*pair.second.cbegin()) << " to "
+              << framework::DataTypeToString(dst_type);
+      for (const auto& var : pair.second) {
+        // NOTE(zhiqiu): Conv + BN always occur together, we needn't
+        // cast X of batch_norm to FP32, which is produced by conv as FP16 type.
+        if (op_type == "batch_norm" && pair.first == "X" &&
+            dst_type == framework::proto::VarType::FP32) {
+          new_ins[pair.first].emplace_back(var);
+          continue;
+        }
+        auto new_var = dst_type == framework::proto::VarType::FP32
+                           ? CastToFP32(var)
+                           : CastToFP16(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+    return new_ins;
+  }
+  return ins;
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1da97e5a39057aed3ed0b4a450bd4a4f5c06984
--- /dev/null
+++ b/paddle/fluid/imperative/amp_auto_cast.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/imperative/type_defs.h"
+
+namespace paddle {
+namespace imperative {
+
+// Singleton implementation with C++ 11
+class AmpOperators {
+ public:
+  ~AmpOperators();
+  AmpOperators(const AmpOperators& o) = delete;
+  const AmpOperators& operator=(const AmpOperators& o) = delete;
+
+  static AmpOperators& Instance();
+
+  std::shared_ptr<std::unordered_set<std::string>> GetAllowOps();
+
+  std::shared_ptr<std::unordered_set<std::string>> GetBlockOps();
+
+ private:
+  AmpOperators();  // forbid calling default constructor
+
+  // The set of ops that support fp16 calculation and are considered numerically
+  // safe and performance critical. These ops are always converted to fp16.
+  std::shared_ptr<std::unordered_set<std::string>> allow_ops_;
+
+  // The set of ops that support fp16 calculation and are considered numerically
+  // dangerous and whose effects may also be observed in downstream ops.
+  std::shared_ptr<std::unordered_set<std::string>> block_ops_;
+};
+
+// NOTE(zhiqiu): AutoCastGuard is used for RAII.
+class AutoCastGuard {
+ public:
+  AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
+      : tracer_(tracer) {
+    pre_mode_ = tracer_->IsAutoCastEnabled();
+    if (pre_mode_ != guard_mode) {
+      tracer_->SetEnableAutoCast(guard_mode);
+    }
+  }
+
+  ~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
+
+  // forbid copy and operator=
+  AutoCastGuard(const AutoCastGuard& guard) = delete;
+  AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;
+
+ private:
+  std::shared_ptr<Tracer> tracer_;
+  bool pre_mode_;
+};
+
+NameVarBaseMap AutoCastInputs(const std::string& op_type,
+                              const NameVarBaseMap& ins);
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index f5fc5944709fc94ef23b878a5f58c9cb1dfed63a..7caeb4378ce3d1ca1d1557054642c9fa184bea39 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
     blas.AXPY(numel_, 1., x_, y_);
   }
 
+  void operator()(const platform::XPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+
 #ifdef PADDLE_WITH_CUDA
   void operator()(const platform::CUDAPlace& place) {
     platform::CUDADeviceContext* ctx =
diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h
index 65ac570bc7aa07a1a06e9deffcf797d6ef5d2519..fcd4545a2c82d3c64f8d8d8683438aaf0e6a2719 100644
--- a/paddle/fluid/imperative/infer_shape_context.h
+++ b/paddle/fluid/imperative/infer_shape_context.h
@@ -16,7 +16,9 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
@@ -32,8 +34,12 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
  public:
   DygraphInferShapeContext(const NameVarMap<VarType>* in,
                            const NameVarMap<VarType>* out,
-                           const framework::AttributeMap* attr)
-      : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr) {}
+                           const framework::AttributeMap* attr,
+                           const std::string op_type)
+      : var_base_map_in_(in),
+        var_base_map_out_(out),
+        attrs_(attr),
+        op_type_(op_type) {}
 
   bool HasInput(const std::string& name) const override {
     // has only one input
@@ -135,6 +141,28 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
 
     return vec_res;
   }
+  std::string GetInputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
+    PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_type_, idx, op_proto->inputs().size()));
+    return op_proto->inputs()[idx].name();
+  }
+
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    auto& op_proto =
+        paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
+    PADDLE_ENFORCE_LT(
+        idx, op_proto->outputs().size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_type_, idx, op_proto->outputs().size()));
+    return op_proto->outputs()[idx].name();
+  }
 
   void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) override {
@@ -367,6 +395,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
   const NameVarMap<VarType>* var_base_map_in_;
   const NameVarMap<VarType>* var_base_map_out_;
   const framework::AttributeMap* attrs_;
+  const std::string op_type_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 17a0b5e0431d44176b896fa1b5df4f88cadafe9f..03e83301d44a35dc98e9a1aee0e1b22ef2380d50 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -186,6 +186,8 @@ class VarBase {
 
   framework::proto::VarType::Type DataType() const { return var_->DataType(); }
 
+  const platform::Place Place() const { return var_->Place(); }
+
   void ClearGradient();
 
   std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index 0b45c189dd714adedc1fb1600e2b350c3dedb62b..4f133bf80c7904d9b6a84c933d431c2820b999e4 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -887,7 +887,10 @@ void PartialGradTask::RunEachOp(OpBase *op) {
                                              op->Attrs(), op->place());
     PADDLE_ENFORCE_NOT_NULL(
         double_grad_node,
-        platform::errors::NotFound("The Op %s doesn't have any grad op.",
+        platform::errors::NotFound("The Op %s doesn't have any grad op. If you "
+                                   "don't intend calculating higher order "
+                                   "derivatives, please set `create_graph` to "
+                                   "False.",
                                    op->Type()));
     VLOG(10) << "Create " << double_grad_node->size()
              << " double grad op(s) for " << op->Type()
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index def5c860449214ad4a08fd69ff575b91d6f162a0..4e0e95dd012976c292b4511e9707802c210dc599 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/prepared_operator.h"
+
 #include <sstream>
+
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #include "paddle/fluid/imperative/infer_var_type_context.h"
@@ -40,23 +42,17 @@ static void PrepareData(const platform::Place& place,
     for (const auto& var_base : name_pair.second) {
       const auto* tensor = GetTensorFromVar(var_base->Var());
       if (tensor && tensor->IsInitialized()) {
-        auto tmp_place = tensor->place();
-
-        // TODO(jiabin): Support transform data layout when we Verify it on more
-        // tests
-        if (!(tmp_place == place)) {
-          auto kernel_type_for_var = op.GetKernelTypeForVar(
-              name_pair.first, *tensor, expected_kernel_key);
-          if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-            continue;
-          } else {
-            VLOG(3) << "Transform Variable " << var_base->Name() << " from "
-                    << kernel_type_for_var << " to " << expected_kernel_key;
-            framework::Tensor out;
-            TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
-                          &out);
-            SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
-          }
+        auto kernel_type_for_var = op.GetKernelTypeForVar(
+            name_pair.first, *tensor, expected_kernel_key);
+        if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
+          continue;
+        } else {
+          VLOG(3) << "Transform Variable " << var_base->Name() << " from "
+                  << kernel_type_for_var << " to " << expected_kernel_key;
+          framework::Tensor out;
+          TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
+                        &out);
+          SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
         }
       }
     }
@@ -91,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
   auto& kernels = kernels_iter->second;
 
   framework::RuntimeContext ctx({}, {});
+#ifdef PADDLE_WITH_MKLDNN
+  // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
+  // GetKernelType functions, so we need to copy the attributes there.
+  // Const qualifier of Attrs had to be discarded to overwrite it.
+  auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
+  mutable_op_attrs = attrs;
+#endif
   auto expected_kernel_key =
       op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
           op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_XPU
+  if (kernel_iter == kernels.end() &&
+      is_xpu_place(expected_kernel_key.place_)) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
   // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
   PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
                     platform::errors::NotFound(
@@ -137,7 +147,8 @@ static void PreparedOpRunImpl(
   // TODO(zjl): remove scope in dygraph
   framework::Scope scope;
 
-  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs);
+  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
+                                                    op.Type());
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc
index a231e16100b9f6b153beffe7c66de6fc6813414e..4a30ffb7e3d01ffa90a42278e2e5ef5271045d8a 100644
--- a/paddle/fluid/imperative/tests/test_layer.cc
+++ b/paddle/fluid/imperative/tests/test_layer.cc
@@ -17,9 +17,11 @@
 //
 
 #include <paddle/fluid/framework/op_registry.h>
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
@@ -384,7 +386,7 @@ TEST(test_layer, test_dygraph_infershape_context) {
   concat_att_map["axis"] = 1;
 
   DygraphInferShapeContext<imperative::VarBase> infer_shape_ctx(
-      &ins, &outs, &concat_att_map);
+      &ins, &outs, &concat_att_map, "dummy");
 
   bool have_x = infer_shape_ctx.HasOutputs("Out");
   ASSERT_EQ(have_x, true);
diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc
index c2e30b45a7f6c06ee6eb8945922a4317e9060491..f226c63f0c432e3878c7df6a5a04433ce047ff26 100644
--- a/paddle/fluid/imperative/tests/test_prepare_op.cc
+++ b/paddle/fluid/imperative/tests/test_prepare_op.cc
@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) {
 }
 #endif
 
-TEST(test_prepare_op, test_prepare_data_same_place) {
+void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
   std::shared_ptr<imperative::VarBase> vin(
       new imperative::VarBase(false, "vin"));
   std::shared_ptr<imperative::VarBase> vout(
@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
   var_pair out_pair = var_pair("Out", vb_vector(1, vout));
   imperative::NameVarBaseMap ins = {x_pair};
   imperative::NameVarBaseMap outs = {out_pair};
-  framework::AttributeMap attr_map;
   const std::string op_type = "relu";
   const auto& info = framework::OpInfoMap::Instance().Get(op_type);
   if (info.Checker()) info.Checker()->Check(&attr_map);
@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
     }
   }
 }
+
+TEST(test_prepare_op, test_prepare_data_same_place) {
+  TestPrepareDataSamePlace({});
+}
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
+  TestPrepareDataSamePlace({{"use_mkldnn", true}});
+}
+#endif
 }  // namespace imperative
 }  // namespace paddle
 
 USE_OP(split);
 USE_OP(relu);
+#ifdef PADDLE_WITH_MKLDNN
+USE_OP_DEVICE_KERNEL(relu, MKLDNN);
+#endif
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index ee4c5617397b39d6847fecd1c884af8b0e14440f..d09cb03360363088bb021285af4574ffbbb81ef0 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -16,6 +16,7 @@
 #include <unordered_set>
 #include <utility>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
@@ -53,8 +54,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
     attr_checker->Check(&attrs, true);
   }
 
+  NameVarBaseMap new_ins = ins;
+  if (enable_autocast_) {
+    VLOG(5) << "Auto mixed precision run operator: " << type;
+    new_ins = AutoCastInputs(type, ins);
+  }
+
   try {
-    OpBase::Run(*op, ins, outs, attrs, place);
+    OpBase::Run(*op, new_ins, outs, attrs, place);
   } catch (platform::EnforceNotMet& exception) {
     framework::AppendErrorOpHint(type, &exception);
     throw std::move(exception);
@@ -73,11 +80,11 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
 
   if (enable_program_desc_tracing_) {
     VLOG(5) << "Trace op " << type << " into ProgramDesc";
-    program_desc_tracer_->InsertOp(type, ins, outs, attrs);
+    program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
   }
 
-  if (ComputeRequiredGrad(ins, outs, trace_backward)) {
-    CreateGradOpNode(*op, ins, outs, attrs, place);
+  if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
+    CreateGradOpNode(*op, new_ins, outs, attrs, place);
   } else {
     VLOG(3) << "No Grad to track for Op: " << type;
   }
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 7652b3aa291ac0063fcc411b5f86f6084f01e8ef..71996b3e1ac998be2c4cd3765591b640765089a0 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -97,6 +97,10 @@ class Tracer {
 
   void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
 
+  void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
+
+  bool IsAutoCastEnabled() const { return enable_autocast_; }
+
  private:
   std::unique_ptr<BasicEngine> basic_engine_;
   std::unique_ptr<jit::ProgramDescTracer> program_desc_tracer_;
@@ -104,6 +108,7 @@ class Tracer {
   std::unique_ptr<UniqueNameGenerator> generator_;
   platform::Place expected_place_;
   bool has_grad_{true};
+  bool enable_autocast_{false};
 };
 
 // To access static variable current_tracer
diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h
index 9c2ff39e8675fbe1ca3777731a5d9408bfc765b3..d730ddc12d1053910a36b8491c2ce983f60b3648 100644
--- a/paddle/fluid/imperative/variable_wrapper.h
+++ b/paddle/fluid/imperative/variable_wrapper.h
@@ -111,6 +111,28 @@ class VariableWrapper {
     }
   }
 
+  const platform::Place Place() const {
+    const framework::Tensor* tensor = nullptr;
+    auto place =
+        platform::CPUPlace();  // Default place for var not initialized.
+    if (var_.IsInitialized()) {
+      if (type_ == framework::proto::VarType::LOD_TENSOR) {
+        tensor = &(var_.Get<framework::LoDTensor>());
+      } else if (type_ == framework::proto::VarType::SELECTED_ROWS) {
+        tensor = &(var_.Get<framework::SelectedRows>().value());
+      } else {
+        VLOG(6) << "Variable " << name_ << " is not initialized";
+        return place;
+      }
+    }
+    if (tensor && tensor->IsInitialized()) {
+      return tensor->place();
+    } else {
+      VLOG(6) << "The tensor of variable " << name_ << " is not initialized";
+      return place;
+    }
+  }
+
  private:
   void SetGradVar(const std::shared_ptr<VariableWrapper>& var) {
     auto shared_var = grad_var_.lock();
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 1f2734eece578f7ec266a6f31cd46b373f010fc1..98554ed04976670c1a846cbeab69815417c0a998 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32)
     SRCS analyzer_tester.cc
     EXTRA_DEPS reset_tensor_array paddle_fluid_shared
     ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
-elseif(NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
-  inference_analysis_test(test_analyzer
-    SRCS analyzer_tester.cc
-    EXTRA_DEPS reset_tensor_array paddle_inference_api
-    ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
+elseif(WIN32)
+    inference_analysis_test(test_analyzer
+      SRCS analyzer_tester.cc
+      EXTRA_DEPS reset_tensor_array paddle_inference_api
+      ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
 endif()
diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md
index 70adb4a974cc5f9911cb302840bbef7ec2591505..9a53ce53ab6a756af666de99c8729bf3da2e4a09 100644
--- a/paddle/fluid/inference/analysis/README.md
+++ b/paddle/fluid/inference/analysis/README.md
@@ -6,13 +6,13 @@ and make the various optimization features be pluggable and co-exist in a pipeli
 
 We borrowed some concepts from LLVM, such as
 
-- [Pass](./pass.h)es to implement optimization that traverse the inference program,
-- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
-- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
+- [Pass](../../framework/ir/pass.h)es to implement optimization that traverse the inference program,
+- [Graph](../../framework/ir/graph.h) to represent the data flow graph built from a program,
+- [PassManager](./ir_pass_manager.h) to manage a sequence of `Pass`es over a graph.
 
 There are some other basic concepts here
 
-- [Node](./node.h), the node in a `DataFlowGraph`,
+- [Node](../../framework/ir/node.h), the node in a `Graph`,
   - `Function`, the Operator in Fluid,
   - `Value`, the Variable in Fluid;
 - [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
@@ -21,9 +21,9 @@ There are some other basic concepts here
 
 The `inference/analysis` module make all the passes in a pipeline, and works in such way:
 
-1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
-2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
-3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
+1. Build a `Graph` from a Fluid inference ProgramDesc,
+2. Call the middle passes one by one, the same `Graph` is passed across all the passes,
+3. Transform a new ProgramDesc from the modified `Graph`.
 
 The new optimization features can be added as an independent `Pass` and controlled by gflags,
 each pass will generate unified debug information or visualization for better debugging.
@@ -54,5 +54,5 @@ It can be used as a helper class that draws the modified graph after each pass.
 There is some helper legacy/function/class for analysis.
 
 - [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
-- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes,
-there are some implementations in  [data_flow_graph.cc](./data_flow_graph.cc) , such as BFS and DFS..
+- [graph_traits.h](../../framework/ir/graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes,
+there are some implementations in  [graph_helper.cc](../../framework/ir/graph_helper.cc) , such as BFS and DFS..
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 30e8386f4c86e308372b5dd6328c7d3785a073b1..fb0ad31a3e612201de32813a65970c73b73b611b 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -54,8 +54,7 @@ if(WITH_TESTING)
                         ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
     set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
-  elseif(NOT WIN32)
-    # TODO: Fix this unittest failed on Windows
+  elseif(WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
                         ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
@@ -67,8 +66,7 @@ endif()
 if (NOT APPLE AND NOT WIN32)
   cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared
           ARGS --dirname=${WORD2VEC_MODEL_DIR})
-elseif (NOT WIN32)
-  # TODO: Fix this unittest failed on Windows
+elseif (WIN32)
   cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
           ARGS --dirname=${WORD2VEC_MODEL_DIR})
 endif()
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index 0dddbf158722181664f3c982cde8b77145eece3c..9be12ff309acff681da75f7f13e317a408a9552a 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -27,6 +27,7 @@
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/pretty_log.h"
 
@@ -50,8 +51,7 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
   using VariableNameMap = std::map<std::string, std::vector<std::string>>;
   std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
   for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
-    if (op->HasAttr("use_quantizer") &&
-        BOOST_GET_CONST(bool, op->GetAttr("use_quantizer"))) {
+    if (platform::HasOpINT8DataType(op)) {
       const VariableNameMap& connections_in = op->Inputs();
       const VariableNameMap& connections_out = op->Outputs();
 
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index fdd71b0d884004c84e2ee15eea522c64ff943dd9..1a3413657ce6fac41603d691dcdb61ddb1d6320a 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
     } else if (shape.size() == 3UL) {
       return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
     }
-    return nvinfer1::Dims4(shape[0], shape[1], 1, 1);
+    nvinfer1::Dims dims;
+    dims.nbDims = shape.size();
+    for (size_t i = 0; i < shape.size(); i++) {
+      dims.d[i] = shape[i];
+    }
+    return dims;
   }
 }
 }  // NOLINT
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 55a57caf9a0d6eb44399ceb8064b613afb955d47..971f99e69197226bb7d7b26135f0b667f8ebdf30 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -56,9 +56,11 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
       dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
-static nvinfer1::IPluginRegistry* getPluginRegistry() {
+#if IS_TRT_VERSION_GE(6000)
+static nvinfer1::IPluginRegistry* GetPluginRegistry() {
   return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
 }
+#endif
 
 // A logger for create TensorRT infer builder.
 class NaiveLogger : public nvinfer1::ILogger {
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 70ead9720d2ebcb15ae0173dc0ba7c2095a4f4d4..f5d22b982de2b41474d97565a44dedc67c8a85d7 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller {
 #if IS_TRT_VERSION_GE(5130)
     teller_set.insert("relu6");
     teller_set.insert("hard_sigmoid");
+    int8_teller_set.insert("relu6");
+    int8_teller_set.insert("hard_sigmoid");
 #endif
 #if IS_TRT_VERSION_GE(6000)
     teller_set.insert("fused_embedding_eltwise_layernorm");
@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller {
                                                   "elementwise_add",
                                                   "leaky_relu",
                                                   "fc",
-                                                  "relu6",
                                                   "concat",
                                                   "scale",
                                                   "elementwise_mul",
-                                                  "conv2d_transpose"};
+                                                  "conv2d_transpose",
+                                                  "hard_swish"};
   std::unordered_set<std::string> teller_set{
       "mul",
       "conv2d",
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index e7f9381e97137d77d27b54cac910bfee9f629464..5e43be90de3dbbfef3c7d3def7e37904bb644380 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
   return ret;
 }
 
+template <typename T>
+void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
+  for (auto ptr : embs_gpu_) {
+    if (ptr) cudaFree(ptr);
+  }
+
+  if (bias_gpu_) cudaFree(bias_gpu_);
+  if (scale_gpu_) cudaFree(scale_gpu_);
+}
+
 template <typename T>
 bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
   int64_t *emb_ptr_gpu_d =
       emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
 
-  std::vector<int64_t> in_ptr, emb_ptr;
+  std::vector<uintptr_t> in_ptr, emb_ptr;
   for (int i = 0; i < input_num; i++) {
     in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
     emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index 8ac611cd7c62fddfd4f01d7705b841abc28501d3..5babd87db0602e973452efa613fcaf9810d29afa 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new EmbEltwiseLayernormPluginDynamic(
+    auto ptr = new EmbEltwiseLayernormPluginDynamic(
         embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
         eps_);
+    ptr->embs_gpu_ = embs_gpu_;
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const override {
@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     return sum_num;
   }
 
+  void terminate() override;
   void serialize(void* buffer) const override {
     // SerializeValue(&buffer, with_fp16_);
     SerializeValue(&buffer, emb_sizes_);
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index f1e11b6fba1f1556e2a8a2aaaca1223aaef76b03..860f1039d5e10290d84d1761bc7337e49fa210eb 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
 
 #if IS_TRT_VERSION_GE(6000)
 
+void PReluPluginDynamic::terminate() {
+  if (p_gpu_weight_) {
+    cudaFree(p_gpu_weight_);
+  }
+}
+
 int PReluPluginDynamic::initialize() {
   cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
   cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index 4756ca2e0225795edc3bd3112b21e3b628ad5c0b..3126366c5fdd8bb69a78cea11f5778c45de738ec 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
   }
   ~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
+    auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
+    ptr->p_gpu_weight_ = p_gpu_weight_;
+    return ptr;
   }
 
   const char* getPluginType() const override { return "prelu_plugin"; }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
+  void terminate() override;
 
   size_t getSerializationSize() const override;
   void serialize(void* buffer) const override;
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
index 8fe1edc4bf0321b054322a27f0c16819bc023ed8..24cd8e0368182ae597e48765bc0167ca1eca6bd3 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new SkipLayerNormPluginDynamic(
+    auto ptr = new SkipLayerNormPluginDynamic(
         bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = bias_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const override { return "skip_layernorm_plugin"; }
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index f4424b8b7851fbf41611d4048a4981982179200f..528adacb27c9897420a5115a93c88c246c0d78d8 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -178,12 +178,16 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
   std::string name_space_;
   std::string plugin_base_;
 };
-#endif
 
 template <typename T>
 class TrtPluginRegistrarV2 {
  public:
-  TrtPluginRegistrarV2() { getPluginRegistry()->registerCreator(creator, ""); }
+  TrtPluginRegistrarV2() {
+    static auto func_ptr = GetPluginRegistry();
+    if (func_ptr != nullptr) {
+      func_ptr->registerCreator(creator, "");
+    }
+  }
 
  private:
   T creator;
@@ -193,6 +197,8 @@ class TrtPluginRegistrarV2 {
   static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
       plugin_registrar_##name {}
 
+#endif
+
 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 6bc7728487cc9b00e2a20280c5579483b806bc10..1b79c77c69e162a6f96a1762a4949386a7dadde4 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -20,6 +20,12 @@ function(download_int8_data install_dir data_file)
     endif()
 endfunction()
 
+function(download_GRU_data install_dir data_file)
+    if (NOT EXISTS ${install_dir}/${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file})
+    endif()
+endfunction()
+
 function(download_quant_data install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
 	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
@@ -97,6 +103,18 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode
              --iterations=2)
 endfunction()
 
+function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary infer_model data_path)
+    inference_analysis_test_run(${TARGET_NAME}
+    COMMAND ${test_binary}
+        ARGS --infer_model=${infer_model}
+             --infer_data=${data_path}
+             --batch_size=50
+             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+             --with_accuracy_layer=true
+             --use_analysis=true
+             --iterations=2)
+endfunction()
+
 function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
 	py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source}
 	        ARGS --data_dir=${data_dir}
@@ -114,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML)
     set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
     download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
     inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
+    set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
     # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
@@ -173,6 +192,7 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz")
 inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
     EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
+set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150) 
 
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
@@ -315,6 +335,20 @@ if(WITH_MKLDNN)
   download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
   inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
 
+  ### Lexcial analysis GRU model
+  set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
+  download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
+  download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz")
+  set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
+  set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model")
+  set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
+  set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
+
+  # build test binary to be used in subsequent tests
+  inference_analysis_api_test_build(${LEXICAL_TEST_APP} ${LEXICAL_TEST_APP_SRC})
+  # run lexcial analysis test
+  inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
+
   ### optimized FP32 vs. Quant INT8 tests
   
   set(QUANT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
@@ -439,19 +473,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
         inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
     endif()
 
-    inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
+    inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
             EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
             ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
 
-    set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
-        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
-    endif()
-
-    inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)
-
 endif()
 
 set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4035c803413794f8b3da8026d373aa8847054f3
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
@@ -0,0 +1,225 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+// setting iterations to 0 means processing the whole dataset
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void SetNativeConfig(AnalysisConfig *cfg,
+                     const int &num_threads = FLAGS_cpu_num_threads) {
+  cfg->SwitchIrOptim(false);
+  cfg->DisableGpu();
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->SetCpuMathLibraryNumThreads(num_threads);
+}
+
+void SetAnalysisConfig(AnalysisConfig *cfg,
+                       const int &num_threads = FLAGS_cpu_num_threads) {
+  cfg->SetModel(FLAGS_infer_model);
+  cfg->DisableGpu();
+  cfg->SwitchIrOptim(true);
+  cfg->SwitchSpecifyInputNames(false);
+  cfg->SetCpuMathLibraryNumThreads(num_threads);
+  cfg->EnableMKLDNN();
+}
+
+std::vector<size_t> ReadSentenceLod(std::ifstream &file, size_t offset,
+                                    int64_t total_sentences_num) {
+  std::vector<size_t> sentence_lod(total_sentences_num);
+
+  file.clear();
+  file.seekg(offset);
+  file.read(reinterpret_cast<char *>(sentence_lod.data()),
+            total_sentences_num * sizeof(size_t));
+
+  if (file.eof()) LOG(ERROR) << "Reached end of stream";
+  if (file.fail()) throw std::runtime_error("Failed reading file.");
+  return sentence_lod;
+}
+
+template <typename T>
+class TensorReader {
+ public:
+  TensorReader(std::ifstream &file, size_t beginning_offset, std::string name)
+      : file_(file), position_(beginning_offset), name_(name) {}
+
+  PaddleTensor NextBatch(std::vector<int> shape, std::vector<size_t> lod) {
+    int numel =
+        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
+    PaddleTensor tensor;
+    tensor.name = name_;
+    tensor.shape = shape;
+    tensor.dtype = GetPaddleDType<T>();
+    tensor.data.Resize(numel * sizeof(T));
+    if (lod.empty() == false) {
+      tensor.lod.clear();
+      tensor.lod.push_back(lod);
+    }
+    file_.seekg(position_);
+    if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
+    if (file_.fail())
+      throw std::runtime_error(name_ + ": failed reading file.");
+    file_.read(reinterpret_cast<char *>(tensor.data.data()), numel * sizeof(T));
+    position_ = file_.tellg();
+    return tensor;
+  }
+
+ protected:
+  std::ifstream &file_;
+  size_t position_;
+  std::string name_;
+};
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
+              int32_t batch_size = FLAGS_batch_size) {
+  std::ifstream file(FLAGS_infer_data, std::ios::binary);
+  if (!file) {
+    FAIL() << "Couldn't open file: " << FLAGS_infer_data;
+  }
+
+  int64_t total_sentences_num = 0L;
+  int64_t total_words_num = 0L;
+  file.seekg(0);
+  file.read(reinterpret_cast<char *>(&total_sentences_num), sizeof(int64_t));
+  LOG(INFO) << "Total sentences in file: " << total_sentences_num;
+  file.read(reinterpret_cast<char *>(&total_words_num), sizeof(int64_t));
+  LOG(INFO) << "Total words in file: " << total_words_num;
+  size_t lods_beginning_offset = static_cast<size_t>(file.tellg());
+  auto words_begining_offset =
+      lods_beginning_offset + sizeof(size_t) * total_sentences_num;
+  auto targets_beginning_offset =
+      words_begining_offset + sizeof(int64_t) * total_words_num;
+
+  std::vector<size_t> lod_full =
+      ReadSentenceLod(file, lods_beginning_offset, total_sentences_num);
+
+  size_t lods_sum = std::accumulate(lod_full.begin(), lod_full.end(), 0UL);
+  EXPECT_EQ(lods_sum, static_cast<size_t>(total_words_num));
+
+  TensorReader<int64_t> words_reader(file, words_begining_offset, "words");
+  TensorReader<int64_t> targets_reader(file, targets_beginning_offset,
+                                       "targets");
+  // If FLAGS_iterations is set to 0, run all batches
+  auto iterations_max = total_sentences_num / batch_size;
+  auto iterations = iterations_max;
+  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
+    iterations = FLAGS_iterations;
+  }
+
+  for (auto i = 0; i < iterations; i++) {
+    // Calculate the words num.  Shape=[words_num, 1]
+    std::vector<size_t> batch_lod = {0};
+    size_t num_words = 0L;
+    std::transform(lod_full.begin() + i * FLAGS_batch_size,
+                   lod_full.begin() + (i + 1) * FLAGS_batch_size,
+                   std::back_inserter(batch_lod),
+                   [&num_words](const size_t lodtemp) -> size_t {
+                     num_words += lodtemp;
+                     return num_words;
+                   });
+    auto words_tensor = words_reader.NextBatch(
+        {static_cast<int>(batch_lod[FLAGS_batch_size]), 1}, batch_lod);
+    if (FLAGS_with_accuracy_layer) {
+      auto targets_tensor = targets_reader.NextBatch(
+          {static_cast<int>(batch_lod[FLAGS_batch_size]), 1}, batch_lod);
+      inputs->emplace_back(std::vector<PaddleTensor>{
+          std::move(words_tensor), std::move(targets_tensor)});
+    } else {
+      inputs->emplace_back(std::vector<PaddleTensor>{std::move(words_tensor)});
+    }
+  }
+}
+
+std::vector<double> Lexical_Test(
+    const std::vector<std::vector<PaddleTensor>> &input_slots_all,
+    std::vector<std::vector<PaddleTensor>> *outputs, AnalysisConfig *config,
+    const bool use_analysis) {
+  TestOneThreadPrediction(
+      reinterpret_cast<const PaddlePredictor::Config *>(config),
+      input_slots_all, outputs, FLAGS_use_analysis);
+  std::vector<double> acc_res(3);
+  if (FLAGS_with_accuracy_layer) {
+    EXPECT_GT(outputs->size(), 0UL);
+    EXPECT_EQ(3UL, (*outputs)[0].size());
+    std::vector<int64_t> acc_sum(3);
+    for (size_t i = 0; i < outputs->size(); i++) {
+      for (size_t j = 0; j < 3UL; j++) {
+        acc_sum[j] =
+            acc_sum[j] + *static_cast<int64_t *>((*outputs)[i][j].data.data());
+      }
+    }
+    // nums_infer, nums_label, nums_correct
+    auto precision =
+        acc_sum[0]
+            ? static_cast<double>(acc_sum[2]) / static_cast<double>(acc_sum[0])
+            : 0;
+    auto recall =
+        acc_sum[1]
+            ? static_cast<double>(acc_sum[2]) / static_cast<double>(acc_sum[1])
+            : 0;
+    auto f1_score =
+        acc_sum[2]
+            ? static_cast<float>(2 * precision * recall) / (precision + recall)
+            : 0;
+
+    LOG(INFO) << "Precision:  " << std::fixed << std::setw(6)
+              << std::setprecision(5) << precision;
+    LOG(INFO) << "Recall:  " << std::fixed << std::setw(6)
+              << std::setprecision(5) << recall;
+    LOG(INFO) << "F1 score: " << std::fixed << std::setw(6)
+              << std::setprecision(5) << f1_score;
+
+    acc_res = {precision, recall, f1_score};
+    // return acc_res;
+  } else {
+    EXPECT_GT(outputs->size(), 0UL);
+    EXPECT_EQ(outputs[0].size(), 1UL);
+    LOG(INFO) << "No accuracy result. To get accuracy result provide a model "
+                 "with accuracy layers in it and use --with_accuracy_layer "
+                 "option.";
+  }
+  return acc_res;
+}
+
+TEST(Analyzer_lexical_test, Analyzer_lexical_analysis) {
+  AnalysisConfig native_cfg;
+
+  std::vector<std::vector<PaddleTensor>> outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  SetNativeConfig(&native_cfg, FLAGS_cpu_num_threads);
+  std::vector<double> acc_ref(3);
+  acc_ref = Lexical_Test(input_slots_all, &outputs, &native_cfg, false);
+  if (FLAGS_use_analysis) {
+    AnalysisConfig analysis_cfg;
+    SetAnalysisConfig(&analysis_cfg, FLAGS_cpu_num_threads);
+    analysis_cfg.pass_builder()->AppendPass("mkldnn_placement_pass");
+    std::vector<double> acc_analysis(3);
+    acc_analysis = Lexical_Test(input_slots_all, &outputs, &analysis_cfg, true);
+    for (size_t i = 0; i < acc_analysis.size(); i++) {
+      CHECK_LE(std::abs(acc_ref[i] - acc_analysis[i]),
+               FLAGS_quantized_accuracy);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
index 1fc35f86bcbc5b595f43569f14e1d2e2bc63c3ad..51870e804144a002b15750ee8c8fa9f4c0af40dc 100644
--- a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
+++ b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
@@ -18,7 +18,7 @@ For reference, please examine the code of unit test enclosed in [analyzer_int8_i
 
 * ### Create Analysis config
 
-INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease)
+INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/inference_deployment/inference/native_infer_en.html#a-name-use-analysisconfig-to-manage-inference-configurations-use-analysisconfig-to-manage-inference-configurations-a)
 
 * ### Create quantize config by analysis config
 
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index 0aea47ae7fab1be3bafe35af575e9a2bea2d8420..5840a4c42b3b1065410dc1509cf0cee2480bd596 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -66,7 +66,7 @@ TEST(AnalysisPredictor, use_gpu) {
   float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) {
     EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
-                10e-5);
+                12e-5);
   }
 }
 
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
index 6526b87436557b7f0c5c6dc5d3b59f2d70323d84..7e5dfa2424dbca4fb3a8a08e3d7fa7fbc3060d3d 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -123,8 +123,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
   config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                 opt_input_shape);
+  AnalysisConfig* config_deser = new AnalysisConfig(config);
+
   std::vector<float> out_data;
-  run(config, &out_data);
+  run(config, &out_data);         // serialize
+  run(*config_deser, &out_data);  // deserialize
   for (size_t i = 0; i < out_data.size(); i++) {
     EXPECT_NEAR(result[i], out_data[i], 1e-6);
   }
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
index babe9977cd571f588f0bdc5a6723d4b05afab72b..c99ebcdcb5f319f73b7fd931d13f27684db39cad 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   std::vector<float> out_data;
   run(config, &out_data);
   for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-6);
+    EXPECT_NEAR(result[i], out_data[i], 1e-5);
   }
 }
 
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 72816f6d0317600ad6bf8ffc4ad31bd1a23d7c30..b9f979f96d4b106642795151fb8e34b025b2caef 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
       ${EXTERNAL_PROJECT_NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${INSTALL_DIR}
-      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
-                            ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
+      URL                   ${URL}/${FILENAME}
       DOWNLOAD_DIR          ${INSTALL_DIR}
+      DOWNLOAD_NO_EXTRACT   1
       DOWNLOAD_NO_PROGRESS  1
       CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
+      BUILD_COMMAND         ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
+                            ${CMAKE_COMMAND} -E tar xzf ${FILENAME}
       UPDATE_COMMAND        ""
       INSTALL_COMMAND       ""
   )
 endfunction()
 
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
-if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
+if(NOT EXISTS ${WORD2VEC_INSTALL_DIR})
   inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
 endif()
 set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index bd1908ac65509343530aa57489661637eed72595..9cc7c267454a4dbd4e1f62ec971e4160d6088913 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
     set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
+elseif(WITH_XPU)
+    set(AllocatorFacadeDeps xpu_info)
 else ()
     set(AllocatorFacadeDeps)
 endif()
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 2ab0d69ef806155adbff83e523a1242e51c2c7fc..3213684c140b02e1fa4b846cb0448f9bc9d8f3ee 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -39,6 +39,9 @@
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
 
 DEFINE_int64(
     gpu_allocator_retry_time, 10000,
@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
     switch (strategy) {
       case AllocatorStrategy::kNaiveBestFit: {
         InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
 #ifdef PADDLE_WITH_CUDA
         for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
              ++dev_id) {
@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
 
       case AllocatorStrategy::kAutoGrowth: {
         InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
 #ifdef PADDLE_WITH_CUDA
         for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
              ++dev_id) {
@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
 
       case AllocatorStrategy::kThreadLocal: {
         InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
 #ifdef PADDLE_WITH_CUDA
         for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
              ++dev_id) {
@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
  private:
   void InitSystemAllocators() {
     system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
+#ifdef PADDLE_WITH_XPU
+    int device_count = platform::GetXPUDeviceCount();
+    for (int i = 0; i < device_count; ++i) {
+      platform::XPUPlace p(i);
+      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
+    }
+#endif
 #ifdef PADDLE_WITH_CUDA
     system_allocators_[platform::CUDAPinnedPlace()] =
         std::make_shared<CPUPinnedAllocator>();
@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
+    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
+  }
+#endif
+
   class ZeroSizeAllocator : public Allocator {
    public:
     explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
     }
     places.emplace_back(platform::CUDAPinnedPlace());
 #endif
+#ifdef PADDLE_WITH_XPU
+    int device_count = platform::GetXPUDeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+      places.emplace_back(platform::XPUPlace(dev_id));
+    }
+#endif
 
     for (auto& p : places) {
       zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 907a266e7b2bcd30e65ca71ab3dbae7f9b110b3b..92e3933a072832fa42520e67f455d3dc90118518 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -29,6 +29,9 @@
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#endif
 
 DEFINE_bool(init_allocated_mem, false,
             "It is a mistake that the values of the memory allocated by "
@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+template <>
+void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
+#ifdef PADDLE_WITH_XPU
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void *p = nullptr;
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  ret = xpu_set_device(place.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (FLAGS_init_allocated_mem) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "xpu memory FLAGS_init_allocated_mem is not implemented."));
+  }
+  ret = xpu_set_device(dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  VLOG(10) << "  pointer=" << p;
+  return p;
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+  return nullptr;
+#endif
+}
+
+template <>
+void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
+                              size_t size) {
+#ifdef PADDLE_WITH_XPU
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  ret = xpu_set_device(place.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  xpu_free(p);
+  ret = xpu_set_device(dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+#endif
+}
+
+template <>
+size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
+#ifdef PADDLE_WITH_XPU
+  printf("Used func return 0 for XPUPlace\n");
+  return 0;
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+#endif
+}
+
 #ifdef PADDLE_WITH_CUDA
 class GPUBuddyAllocatorList {
  private:
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index b19f02db1c0ddf17c84536bf5d512bbd823909b2..225b6858cc1f2a5afc9d612958694d0d940e2e7b 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#endif
+
 namespace paddle {
 namespace memory {
 
@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
   std::memcpy(dst, src, num);
 }
 
+#ifdef PADDLE_WITH_XPU
+template <>
+void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num) {
+  if (num <= 0) {
+    VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
+    return;
+  }
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  if (dev_id != dst_place.device) {
+    ret = xpu_set_device(dst_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+  ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id != dst_place.device) {
+    ret = xpu_set_device(dev_id);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::XPUPlace src_place,
+                                                  const void* src, size_t num) {
+  if (num <= 0) {
+    VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
+    return;
+  }
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  if (dev_id != src_place.device) {
+    ret = xpu_set_device(src_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+  ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id != src_place.device) {
+    ret = xpu_set_device(dev_id);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+}
+
+template <>
+void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::XPUPlace src_place,
+                                                  const void* src, size_t num) {
+  if (num <= 0) {
+    VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
+    return;
+  }
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  if (dev_id != src_place.device || dev_id != dst_place.device) {
+    ret = xpu_set_device(src_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    void* tmp = malloc(num);
+    ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    ret = xpu_set_device(dst_place.device);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    ret = xpu_set_device(dev_id);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    free(tmp);
+  } else {
+    int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+  }
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 012b16a6a05f3d5fec3636b0a790d4d67334295f..6e8ff52ed4a8846f5f6060e10cfd9bec22308e9e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -86,12 +86,16 @@ if (WITH_DGC)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc)
 endif()
 
+cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows
+lod_tensor maxouting unpooling pooling lod_rank_table context_project
+sequence_pooling executor device_memory_aligment generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
@@ -111,6 +115,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
 set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
 
+cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax)
 cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op)
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
@@ -118,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
+nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
 if (WITH_GPU)
     nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
 else()
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index b9a92c2207d8e9b86cc95be8285ce6b2e6db597b..63b3b0f1a3408154a2d1c8aff76a85a95ad044f6 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+
 #include <memory>
 #include <string>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>
+
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
 #ifdef PADDLE_WITH_CUDA
@@ -199,7 +202,7 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 UNUSED constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.
 
-.. math:: out=\sqrt x=x^{1/2}
+.. math:: out=\\sqrt{x}=x^{1/2}
 
 **Note**:
   input value must be greater than or equal to zero.
@@ -211,12 +214,12 @@ Rsqrt Activation Operator.
 
 Please make sure input is legal in case of numeric errors.
 
-$$out = \frac{1}{\sqrt{x}}$$
+$$out = \\frac{1}{\\sqrt{x}}$$
 
 )DOC";
 
 UNUSED constexpr char AbsDoc[] = R"DOC(
-Abs Activation Operator.
+Abs Operator.
 
 $$out = |x|$$
 
@@ -239,6 +242,9 @@ $$out = \\left \\lfloor x \\right \\rfloor$$
 UNUSED constexpr char CosDoc[] = R"DOC(
 Cosine Operator. Computes cosine of x element-wise.
 
+Input range is `(-inf, inf)` and output range is `[-1,1]`.
+Return `nan` if input is out of boundary.
+
 $$out = cos(x)$$
 
 )DOC";
@@ -311,13 +317,6 @@ $$out = x^2$$
 
 )DOC";
 
-UNUSED constexpr char SoftplusDoc[] = R"DOC(
-Softplus Activation Operator.
-
-$$out = \ln(1 + e^{x})$$
-
-)DOC";
-
 UNUSED constexpr char SoftsignDoc[] = R"DOC(
 Softsign Activation Operator.
 
@@ -331,7 +330,7 @@ class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of acos operator");
     AddOutput("Out", "Output of acos operator");
     AddComment(R"DOC(
-Arccosine Activation Operator.
+Arccosine Operator.
 
 $$out = \cos^{-1}(x)$$
 
@@ -345,7 +344,7 @@ class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of asin operator");
     AddOutput("Out", "Output of asin operator");
     AddComment(R"DOC(
-Arcsine Activation Operator.
+Arcsine Operator.
 
 $$out = \sin^{-1}(x)$$
 
@@ -359,9 +358,9 @@ class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of atan operator");
     AddOutput("Out", "Output of atan operator");
     AddComment(R"DOC(
-Arctanh Activation Operator.
+Arctangent Operator.
 
-$$out = \tanh^{-1}(x)$$
+$$out = \tan^{-1}(x)$$
 
 )DOC");
   }
@@ -390,6 +389,36 @@ $$out = \max(x, \alpha * x)$$
   }
 };
 
+class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "Input of Softplus operator, an N-D Tensor, with data type "
+             "float32, float64 or float16.");
+    AddOutput(
+        "Out",
+        "Output of Softplus operator, a Tensor with shape same as input.");
+    AddAttr<float>("beta", "The value of beta for Softplus.").SetDefault(1.0f);
+    AddAttr<float>("threshold", "The value of threshold for Softplus.")
+        .SetDefault(20.0f);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel.")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default false) Only used in cudnn kernel, need install cudnn.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+:strong:`Softplus Activation Operator`
+
+..  math::
+    out = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) \\
+    \text{For numerical stability, the implementation reverts to the linear function when :}\,x \times \beta > threshold.
+
+)DOC");
+  }
+};
+
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -504,6 +533,9 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("threshold",
                    "The threshold value of Relu6. Default is 6.0. ")
         .SetDefault(6.0f);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Relu6 Activation Operator.
 
@@ -663,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
 REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
 REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc);
 REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
-REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
 REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
 
 template <ActBwdOpFwdDeps kDepValue>
@@ -750,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
   }
 };
 
-// leaky_relu Grad: dx=dy if y>=0 else alpha * dy
-// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx
+// leaky_relu Grad: dx=dy if x>=0 else alpha * dy
+// leaky_relu GradGrad: ddy=ddx if x>=0 else alpha * ddx
 template <typename T>
 class LeakyReluDoubleGradMaker
     : public ::paddle::framework::SingleGradOpMaker<T> {
@@ -761,8 +792,8 @@ class LeakyReluDoubleGradMaker
  protected:
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("leaky_relu_grad_grad");
-    // input1: Out
-    op->SetInput("Out", this->Input("Out"));
+    // input1: X
+    op->SetInput("X", this->Input("X"));
     // X@GRAD@GRAD: ddx
     op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
     op->SetAttrMap(this->Attrs());
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 3aac7ae8a5e8a9e889242b59f42a29af08ad1c46..00a7c063c9155488d117332d5ef3541d16d76bdb 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
   }
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>();
-    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>();
-    out.device(d) = x * (temp1 + temp2);
+    auto temp1 = x < static_cast<T>(threshold * -1.f);
+    auto temp2 = x > static_cast<T>(threshold);
+    out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
   }
 };
 
@@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>();
-    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+    auto temp1 = x < static_cast<T>(threshold * -1.f);
+    auto temp2 = x > static_cast<T>(threshold);
+    dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -975,32 +975,46 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
-// softplus(x) = log(1 + exp(x))
-// When x is a very large positive number, exp(x) may explode to inf,
-// Using trick below for numerical stability
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
+// For numerical stability, using the following formula instead of softplus(x) =
+// log(1 + exp(x))
+// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
+// 1, threshold = 20 by default), otherwise x
 template <typename T>
 struct SoftplusFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) {
-    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
-    out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
+    auto x_beta = static_cast<T>(beta) * x;
+    out.device(d) = (x_beta > static_cast<T>(threshold))
+                        .select(x, (static_cast<T>(1) + x_beta.exp()).log() /
+                                       static_cast<T>(beta));
   }
 };
 
-// d(softplus(x))/dx = exp(x) / (1 + exp(x))
-// For numerical stability:
-// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) +
-// exp(x - max(x, 0)))
+// For numerical stability, using the following formula instead of
+// d(softplus(x))/dx = 1 / (1 + exp(-x))
+// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta
+// = 1, threshold = 20 by default), otherwise x
 template <typename T>
 struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) {
-    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    auto x_beta = static_cast<T>(beta) * x;
     dx.device(d) =
-        dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
+        (x_beta > static_cast<T>(threshold))
+            .select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
 
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    if (alpha < 1.f) {
+      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    } else {
+      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
+    }
   }
 };
 
@@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 =
-        static_cast<T>(alpha) * (out <= static_cast<T>(0)).template cast<T>();
-    auto temp2 = (out > static_cast<T>(0)).template cast<T>();
+        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
-                   dout * static_cast<T>(alpha) * x.exp() *
-                       (x <= static_cast<T>(0)).template cast<T>();
+    auto temp_a_pos = static_cast<T>(alpha > 0);
+    auto temp_a_neg = static_cast<T>(alpha <= 0);
+    auto temp_x_pos = (x > static_cast<T>(0)).template cast<T>();
+    auto temp_x_neg = (x <= static_cast<T>(0)).template cast<T>();
+
+    // dx = dout, if alpha > 0 and x > 0
+    // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0
+    // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0
+    // dx = 0, if alpha <= 0 and x <=0
+    dx.device(d) =
+        dout * temp_a_pos * temp_x_pos +
+        dout * static_cast<T>(alpha) * x.exp() * temp_a_pos * temp_x_neg +
+        dout * (static_cast<T>(1) + static_cast<T>(alpha) * x.exp()) *
+            temp_a_neg * temp_x_pos;
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
       auto* d = dev.eigen_device();
       auto ddx = framework::EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-      auto out = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad"));
+      auto x = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
       auto ddout = framework::EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((out > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) *
-                              (out <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
+      ddout.device(*d) =
+          ddx *
+          ((x > static_cast<T>(0)).template cast<T>() +
+           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
+              .template cast<T>();
     }
   }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index f7cc513b234e6e440507af28189ac236b71f9d15..d1a3695015abdb9ce13c4f807d1abacdf0af024d 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -28,10 +28,15 @@ using Tensor = framework::Tensor;
 
 template <typename T>
 struct Linspace<paddle::platform::CPUDeviceContext, T> {
-  void operator()(T start, T end, int count, framework::Tensor* numbers,
+  void operator()(T start, T end, int count, bool align_corners,
+                  framework::Tensor* numbers,
                   const framework::ExecutionContext& ctx) {
     T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace());
     T slice = (end - start) / (T)(count - 1);
+    if (!align_corners) {
+      slice = (end - start) / (T)count;
+      start *= (T)(count - 1) / (T)count;
+    }
     for (int i = 0; i < count; ++i) {
       number_data[i] = start + (T)i * slice;
     }
@@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
         "use_cudnn",
         "(bool, default false) Only used in cudnn kernel, need install cudnn")
         .SetDefault(true);
+    AddAttr<bool>("align_corners",
+                  "(bool, default false) Whether to align the corners of input"
+                  "and ouput.")
+        .SetDefault(true);
     AddAttr<std::vector<int>>(
         "output_shape",
         "The target output image shape with format [N, C, H, W].")
@@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
               [-1.  -0.5  0.   0.5  1. ]
               [-1.  -0.5  0.   0.5  1. ]
               [-1.  -0.5  0.   0.5  1. ]]]
-        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+        C[0] is the coordinates in height axis and  C[1] is the coordinates in
+        width axis.
     
     Step2:
-        Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
+        Tanspose and reshape C to shape [H * W, 2] and append ones to last
+        dimension. The we get:
         C_ = [[-1.  -1.   1. ]
               [-0.5 -1.   1. ]
               [ 0.  -1.   1. ]
diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7aaaa0002c5ab31af72c75e69f5a283c09633ba4
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -0,0 +1,211 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/affine_grid_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
+  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+struct Linspace<paddle::platform::CUDADeviceContext, T> {
+  void operator()(T start, T end, int count, bool align_corners,
+                  framework::Tensor* numbers,
+                  const framework::ExecutionContext& ctx) {
+    T* number_data = numbers->mutable_data<T>({count}, ctx.GetPlace());
+    T slice = (end - start) / (T)(count - 1);
+    if (!align_corners) {
+      slice = (end - start) / (T)count;
+      start *= (T)(count - 1) / (T)count;
+    }
+    auto stream = ctx.cuda_device_context().stream();
+    int block = 512;
+    int grid = (count + block - 1) / block;
+    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, slice, count,
+                                                  number_data);
+  }
+};
+
+template <typename T>
+__global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w,
+                                   T h_start, T w_start, T h_step, T w_step,
+                                   const T* theta,  // N, 2, 3
+                                   T* output) {
+  CUDA_KERNEL_LOOP(index, count) {
+    int w = index % out_w;
+    int h = (index / out_w) % out_h;
+    int n = index / (out_w * out_h);
+
+    T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
+    T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
+
+    int theta_offset = n * 6;  // 2 * 3;
+    // affine from (h_coor, w_coor) to (x, y)
+    output[index * 2] = theta[theta_offset] * h_coor +
+                        theta[theta_offset + 1] * w_coor +
+                        theta[theta_offset + 2];
+    output[index * 2 + 1] = theta[theta_offset + 3] * h_coor +
+                            theta[theta_offset + 4] * w_coor +
+                            theta[theta_offset + 5];
+  }
+}
+
+template <typename T>
+__global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
+                                        int out_w, T h_start, T w_start,
+                                        T h_step, T w_step,
+                                        const T* out_grad,  // N, H, W, 2
+                                        T* theta_grad) {    // N, 2, 3
+  CUDA_KERNEL_LOOP(index, count) {
+    int w = index % out_w;
+    int h = (index / out_w) % out_h;
+    int n = index / (out_w * out_h);
+    T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
+    T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
+
+    int theta_offset = n * 6;  // 2 * 3;
+    T out_grad_x = out_grad[index * 2];
+    platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);
+
+    T out_grad_y = out_grad[index * 2 + 1];
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
+    platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
+  }
+}
+
+template <typename T>
+class AffineGridOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* theta = ctx.Input<Tensor>("Theta");
+    int n = theta->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+    auto* output = ctx.Output<Tensor>("Output");
+    T* out_data = output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+
+    T h_step;
+    T w_step;
+    T h_start = -1;
+    T w_start = -1;
+    if (align_corners) {
+      h_step = static_cast<T>(2) / static_cast<T>(h - 1);
+      w_step = static_cast<T>(2) / static_cast<T>(w - 1);
+    } else {
+      h_step = static_cast<T>(2) / static_cast<T>(h);
+      w_step = static_cast<T>(2) / static_cast<T>(w);
+
+      h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
+      w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
+    }
+
+    const int count = n * h * w;
+    int block = 512;
+    int grid = (count + block - 1) / block;
+    auto cu_stream = ctx.cuda_device_context().stream();
+    affine_grid_kernel<<<grid, block, 0, cu_stream>>>(
+        count, n, h, w, h_start, w_start, h_step, w_step,
+        theta->data<T>(),  // N, 2, 3
+        out_data);
+  }
+};
+
+template <typename T>
+class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+    T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
+
+    T h_step;
+    T w_step;
+    T h_start = -1;
+    T w_start = -1;
+    if (align_corners) {
+      h_step = static_cast<T>(2) / static_cast<T>(h - 1);
+      w_step = static_cast<T>(2) / static_cast<T>(w - 1);
+    } else {
+      h_step = static_cast<T>(2) / static_cast<T>(h);
+      w_step = static_cast<T>(2) / static_cast<T>(w);
+
+      h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
+      w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
+    }
+    const int count = n * h * w;
+    VLOG(3) << "count: " << count << "; h_step: " << h_step
+            << "; w_step: " << w_step << "; h_start: " << h_start
+            << "; w_start: " << w_start;
+    int block = 512;
+    int grid = (count + block - 1) / block;
+    auto cu_stream = ctx.cuda_device_context().stream();
+    affine_grid_grad_kernel<<<grid, block, 0, cu_stream>>>(
+        count, n, h, w, h_start, w_start, h_step, w_step,
+        output_grad->data<T>(), theta_grad_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(affine_grid, ops::AffineGridOpCUDAKernel<float>,
+                        ops::AffineGridOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(affine_grid_grad,
+                        ops::AffineGridGradOpCUDAKernel<float>,
+                        ops::AffineGridGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
index 73df8a38b96c30196a7e39d2cf1e348f2a7722ec..50c9ebcd9c8f52077d7f5d0abb10c631cbeee794 100644
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes<int64_t, 4>;
  */
 template <typename DeviceContext, typename T>
 struct Linspace {
-  void operator()(T start, T end, int count, framework::Tensor* numbers,
+  void operator()(T start, T end, int count, bool align_corners,
+                  framework::Tensor* numbers,
                   const framework::ExecutionContext& ctx);
 };
 
 template <typename DeviceContext, typename T>
-inline void GetIdxMap(int n, int h, int w, Tensor* grid,
+inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid,
                       const framework::ExecutionContext& ctx) {
   auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
   grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
@@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
   // Get indexes of height with shape [height, width, 1]
   Tensor h_idx;
   Linspace<DeviceContext, T> linspace;
-  linspace((T)-1, (T)1, h, &h_idx, ctx);
+  linspace((T)-1, (T)1, h, align_corners, &h_idx, ctx);
   auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
   // Get indexes of width with shape [height, width, 1]
   Tensor w_idx;
-  linspace((T)-1, (T)1, w, &w_idx, ctx);
+  linspace((T)-1, (T)1, w, align_corners, &w_idx, ctx);
   auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
   // Get constant ones tensor with shape [height, width, 1]
   Tensor ones;
   ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
-  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
+
+  math::SetConstant<DeviceContext, T>()(
+      ctx.template device_context<DeviceContext>(), &ones, static_cast<T>(1));
+  auto ones_t = EigenTensor<T, 3>::From(ones);
   // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
   // ones
   Tensor w_idx_map;
@@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
   Tensor w_h_one_idx_map;
   w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace());
   auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map);
-
   w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w))
                                   .broadcast(Array2(h, 1))
                                   .reshape(Array3(h, w, 1));
-
   h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h))
                                   .broadcast(Array2(w, 1))
                                   .shuffle(Array2(1, 0))
@@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
     auto* theta = ctx.Input<Tensor>("Theta");
     int n = theta->dims()[0];
     auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
     int h = 0;
     int w = 0;
     if (size_attr.size() == 0) {
@@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
         ctx.template device_context<DeviceContext>(), output,
         static_cast<T>(0));
     Tensor grid;
-    GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
+    GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
@@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
     auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
     int n = output_grad->dims()[0];
     auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    auto align_corners = ctx.Attr<bool>("align_corners");
     int h = 0;
     int w = 0;
     if (size_attr.size() == 0) {
@@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
         ctx.template device_context<DeviceContext>(), theta_grad,
         static_cast<T>(0));
     Tensor grid;
-    GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx);
+    GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc
index 911757007266c9ff88b0e348d350909ce0ff0bce..736483c3304ac32491de4fd98879fbfef04f7110 100644
--- a/paddle/fluid/operators/allclose_op.cc
+++ b/paddle/fluid/operators/allclose_op.cc
@@ -22,9 +22,11 @@ namespace operators {
 class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Input", "The first input tensor to compare.");
-    AddInput("Other", "The second input tensor to compare.");
-    AddOutput("Out", "The output tensor of allclose op.");
+    AddInput("Input",
+             "The input tensor, it's data type should be float32, float64.");
+    AddInput("Other",
+             "The input tensor, it's data type should be float32, float64.");
+    AddOutput("Out", "The output tensor, it's data type is bool.");
 
     AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
         .SetDefault(1e-5);
@@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(false);
 
     AddComment(R"DOC( 
-This operator checks if all :math:`input` and :math:`other` satisfy the condition:
+This operator checks if all :math:`x` and :math:`y` satisfy the condition:
 
-:math:`\left| input - other \right| \leq atol + rtol \times \left| other \right|`
+.. math::
+    \left| x - y \right| \leq atol + rtol \times \left| y \right|
 
-elementwise, for all elements of :math:`input` and :math:`other`. The behaviour of this
+elementwise, for all elements of :math:`x` and :math:`y`. The behaviour of this
 operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if
 two tensors are elementwise equal within a tolerance.
 )DOC");
diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
index 01b6ccedcdd8156269d445d7822a4184c062b225..7f0ca1493f712f7f4809a56bf6a23f8757f94c2d 100644
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
+++ b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
+
 #include <string>
 #include <vector>
 
@@ -67,7 +68,7 @@ class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker {
               "amp_check_finite_and_unscale operator.")
         .AsDuplicable();
     AddOutput("FoundInfinite",
-              "(Tensor) 1-dim tensor, contains a int scalar, which indicates "
+              "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
               "if there there is infinite or nan item in input X.");
     AddComment(R"DOC(
 amp_check_finite_and_scale operator.
diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
index b92c6901d71bd80c45b0681f62a1a2ddedfcf64a..ee00d7c5f4499867c2c706ddcf314c1bfae0a866 100644
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
+++ b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cuda.h>
+
 #include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -21,7 +22,7 @@ namespace operators {
 
 template <typename T>
 __global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num,
-                                       int* found_inf, T* out) {
+                                       bool* found_inf, T* out) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
 
   if (idx < num) {
@@ -44,7 +45,7 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
     auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
 
     const T* scale_data = scale->data<T>();
-    int* found_inf_data = found_inf->mutable_data<int>(dev_ctx.GetPlace());
+    bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
     cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
 
     for (size_t i = 0; i < xs.size(); ++i) {
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
index 85e4f98173511435a52b32e506afc8d5b772f74f..14708c4df10f5160d0e72e7669e0015554d8215f 100644
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -1,29 +1,22 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_max,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t>);
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    arg_max, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMax>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMax>);
diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..73581dac4e419ca9c970db4414ff54d4cbd3fd70
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef __NVCC__
+
+#include <cub/cub.cuh>
+#include <limits>
+#include <string>
+#include <typeinfo>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/transpose_op.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {  // NOLINT
+template <typename K, typename V>
+using KeyValuePair = cub::KeyValuePair<K, V>;
+using Tensor = framework::Tensor;
+
+}  // end namespace
+
+#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
+  case (1 << (log2_block_dim)): {                       \
+    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
+    __VA_ARGS__;                                        \
+  } break
+
+#define FIXED_BLOCK_DIM_CASE(...)               \
+  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
+
+template <typename T, typename IndType, class Reducer, size_t BlockDim>
+__global__ void ArgCUDAKernel(const int64_t height,     // n * h
+                              const int64_t width,      // c
+                              const int64_t post_size,  // h
+                              const Reducer reducer, const T init, const T* in,
+                              IndType* out) {
+  typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
+    KeyValuePair<int, T> kv_pair = {-1, init};
+    int h = idx / post_size;
+    int w = idx % post_size;
+    for (int k = threadIdx.x; k < width; k += blockDim.x) {
+      kv_pair =
+          reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
+    }
+    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
+    if (threadIdx.x == 0) {
+      out[idx] = static_cast<IndType>(kv_pair.key);
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, typename IndType, class Reducer>
+void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
+                    Tensor* indices, const int64_t pre, const int64_t post,
+                    const int64_t n) {
+  auto cu_stream = ctx.stream();
+  auto ComputeBlockSize = [](int64_t col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256)
+      return 512;
+    else if (col > 128)
+      return 256;
+    else if (col > 64)
+      return 128;
+    else if (col > 32)
+      return 64;
+    else if (col > 16)
+      return 32;
+    else if (col > 8)
+      return 16;
+    else
+      return 8;
+  };
+
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+  int64_t height = pre * post;
+  int64_t width = n;
+  int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
+
+  const T* in_data = input.data<T>();
+  IndType* out_data = indices->mutable_data<IndType>(ctx.GetPlace());
+
+  if (typeid(Reducer) == typeid(cub::ArgMax)) {
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgCUDAKernel<T, IndType, Reducer,
+                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height, width, post, Reducer(), std::numeric_limits<T>::lowest(),
+              in_data, out_data));
+    }
+  } else {
+    switch (ComputeBlockSize(width)) {
+      FIXED_BLOCK_DIM_CASE(
+          ArgCUDAKernel<T, IndType, Reducer,
+                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+              height, width, post, Reducer(), std::numeric_limits<T>::max(),
+              in_data, out_data));
+    }
+  }
+}
+
+template <typename T, class Reducer>
+struct VisitDataCudaArgMinMaxFunctor {
+  const framework::ExecutionContext& ctx;
+
+  explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx)
+      : ctx(ctx) {}
+  template <typename IndType>
+  void apply() const {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    int axis = ctx.Attr<int64_t>("axis");
+    const bool& flatten = ctx.Attr<bool>("flatten");
+
+    framework::DDim input_dims;
+    if (flatten) {
+      input_dims = framework::make_ddim({input->numel()});
+      // if flatten, the axis just as 0
+      axis = 0;
+    } else {
+      input_dims = input->dims();
+      if (axis < 0) axis += input->dims().size();
+    }
+
+    int64_t numel = input->numel();
+    int64_t groups = numel / input_dims[axis];
+    int64_t pre = 1;
+    int64_t post = 1;
+    int64_t n = input_dims[axis];
+
+    for (int i = 0; i < axis; i++) {
+      pre *= input_dims[i];
+    }
+
+    for (int i = axis + 1; i < input_dims.size(); i++) {
+      post *= input_dims[i];
+    }
+
+    const auto& dev_ctx = ctx.cuda_device_context();
+    ComputeFullArg<T, IndType, Reducer>(dev_ctx, *input, output, pre, post, n);
+  }
+};
+template <typename T, class Reducer>
+class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dtype = ctx.Attr<int>("dtype");
+    if (dtype < 0) {
+      framework::VisitDataType(static_cast<framework::proto::VarType::Type>(
+                                   framework::proto::VarType::INT64),
+                               VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
+      return;
+    }
+    framework::VisitDataType(
+        static_cast<framework::proto::VarType::Type>(dtype),
+        VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
+  }
+};
+
+#endif
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index 0fc7b47c62ea9d7da805b797fcf5e4db4e39328d..ae3637f6f99783d70bd57a3935a979b0387692de 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {};
   struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
                           enum_argminmax_value> {                             \
     void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
-                    framework::LoDTensor* out, int64_t axis, bool keepdims) { \
-      auto in_eigen = framework::EigenTensor<T, Rank>::From(in);              \
+                    framework::LoDTensor* out, framework::DDim x_dims,        \
+                    int64_t axis, bool keepdims) {                            \
+      auto in_eigen = framework::EigenTensor<T, Rank>::From(in, x_dims);      \
       if (keepdims) {                                                         \
         auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out);      \
         out_eigen.device(*(ctx.eigen_device())) =                             \
@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor {
     out.template mutable_data<Tout>(ctx.GetPlace());
     auto axis = ctx.Attr<int64_t>("axis");
     auto keepdims = ctx.Attr<bool>("keepdims");
-    auto x_rank = x.dims().size();
-    if (axis < 0) axis += x_rank;
+    const bool& flatten = ctx.Attr<bool>("flatten");
+
+    // if flatten, will construct the new dims for the cacluate
+    framework::DDim x_dims;
+    if (flatten) {
+      x_dims = framework::make_ddim({x.numel()});
+      // if flatten, the axis just as 0
+      axis = 0;
+    } else {
+      x_dims = x.dims();
+      if (axis < 0) axis += x_dims.size();
+    }
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
 #define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
   ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
       functor##rank;                                                 \
-  functor##rank(dev_ctx, x, &out, axis, keepdims)
+  functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims)
 
-    switch (x.dims().size()) {
+    switch (x_dims.size()) {
       case 1:
         CALL_ARG_MINMAX_FUNCTOR(1);
         break;
@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
     const auto& x_dims = ctx->GetInputDim("X");
     int64_t axis = ctx->Attrs().Get<int64_t>("axis");
     bool keepdims = ctx->Attrs().Get<bool>("keepdims");
+    const bool& flatten = ctx->Attrs().Get<bool>("flatten");
 
     PADDLE_ENFORCE_GE(axis, -x_dims.size(),
                       platform::errors::InvalidArgument(
@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
         platform::errors::InvalidArgument(
             "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
 
-    auto x_rank = x_dims.size();
-    if (axis < 0) axis += x_rank;
     std::vector<int64_t> vec;
-    for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
-    if (keepdims) {
-      vec.push_back(static_cast<int64_t>(1));
+    if (flatten) {
+      // if is flatten, will return the only on element
+      if (keepdims) {
+        vec.emplace_back(static_cast<int64_t>(1));
+      }
+    } else {
+      auto x_rank = x_dims.size();
+      if (axis < 0) axis += x_rank;
+      for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
+      if (keepdims) {
+        vec.emplace_back(static_cast<int64_t>(1));
+      }
+      for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
     }
-    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
     ctx->SetOutputDim("Out", framework::make_ddim(vec));
   }
 };
@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
     AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
     AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
+    AddAttr<bool>("flatten",
+                  "Flatten the input value, and search the min or max indices")
+        .SetDefault(false);
     AddComment(string::Sprintf(R"DOC(
       %s Operator.
 
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
index 47d7c8b12243c6c5c501188af7f48f125c266009..23170bf0087906d752767051ce58874cb3584ee5 100644
--- a/paddle/fluid/operators/arg_min_op.cu
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -1,29 +1,21 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    arg_min,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    double>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int64_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int32_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    int16_t>,
-    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
-                                    uint8_t>);
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
+REGISTER_OP_CUDA_KERNEL(
+    arg_min, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMin>,
+    paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMin>);
diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc
index 50797a100b1a67244b7c7b40b47404b60dc6af65..f56789b889526301e670ac37d1d6131aaafb070a 100644
--- a/paddle/fluid/operators/bce_loss_op.cc
+++ b/paddle/fluid/operators/bce_loss_op.cc
@@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss");
 
     auto x_dims = ctx->GetInputDim("X");
-    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(
-        x_dims.size(), label_dims.size(),
-        platform::errors::InvalidArgument(
-            "Input(X) and Input(Label) shall have the same shape."));
-    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
-                               framework::contain_unknown_dim(label_dims);
-    bool check = ctx->IsRuntime() || !contain_unknown_dim;
+    auto labels_dims = ctx->GetInputDim("Label");
+
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
+                      platform::errors::InvalidArgument(
+                          "Input(X) and Input(Label) shall have the same rank."
+                          "But received: the rank of Input(X) is [%d], "
+                          "the rank of Input(Label) is [%d].",
+                          rank, labels_dims.size()));
+
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+
     if (check) {
-      PADDLE_ENFORCE_EQ(
-          x_dims.size(), label_dims.size(),
-          platform::errors::InvalidArgument(
-              "ShapeError: Input(X) and Input(Label) shall have the same shape "
-              "But received: the shape of Input(X) is [%s], the shape of "
-              "Input(Label) is [%s].",
-              x_dims, label_dims));
+      PADDLE_ENFORCE_EQ(x_dims, labels_dims,
+                        platform::errors::InvalidArgument(
+                            "Input(X) and Input(Label) shall have the same "
+                            "shape. But received: the shape of Input(X) is "
+                            "[%s], the shape of Input(Label) is [%s].",
+                            x_dims, labels_dims));
     }
 
     ctx->ShareDim("X", "Out");
@@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel {
                    framework::GradVarName("X"), "BCELossGrad");
 
     auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Label");
     auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
-                               framework::contain_unknown_dim(dout_dims);
-    bool check = ctx->IsRuntime() || !contain_unknown_dim;
+
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+
     if (check) {
+      PADDLE_ENFORCE_EQ(x_dims, labels_dims,
+                        platform::errors::InvalidArgument(
+                            "Input(X) and Input(Label) shall have the same "
+                            "shape. But received: the shape of Input(X) is "
+                            "[%s], the shape of Input(Label) is [%s].",
+                            x_dims, labels_dims));
+
       PADDLE_ENFORCE_EQ(x_dims, dout_dims,
                         platform::errors::InvalidArgument(
-                            "ShapeError:The Input(X) and Input(Out@Grad) "
-                            "should have the same "
-                            "shape, But received: the shape of Input(X) is "
-                            "[%s], the shape of "
-                            "Input(Out@GRAD) is [%s].",
+                            "Input(X) and Input(Out@Grad) shall have the same "
+                            "shape. But received: the shape of Input(X) is "
+                            "[%s], the shape of Input(Out@Grad) is [%s].",
                             x_dims, dout_dims));
     }
+
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     ctx->ShareLoD("X", framework::GradVarName("X"));
   }
diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu
index 8e30f4eb15b6afde885512206c7eaeb721cdd44b..16db4f05e31d365d8d06174ab708e30474b8a8c2 100644
--- a/paddle/fluid/operators/bce_loss_op.cu
+++ b/paddle/fluid/operators/bce_loss_op.cu
@@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
 
     auto x_data = x->data<T>();
     auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    int x_numel = x->numel();
+    auto x_numel = x->numel();
+
     platform::GpuLaunchConfig config =
         platform::getGpuLaunchConfig(x_numel, ctx);
 
@@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
     framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu);
     T* x_cpu_data = x_cpu.data<T>();
 
-    for (int i = 0; i < x_numel; ++i) {
+    for (int64_t i = 0; i < x_numel; ++i) {
       PADDLE_ENFORCE_GE(
           x_cpu_data[i], static_cast<T>(0),
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/bce_loss_op.h b/paddle/fluid/operators/bce_loss_op.h
index 85e120e4642a298ebff00fc0e4b6425f775443aa..dd87b69efe2869727f2db778cec44612efbcff6b 100644
--- a/paddle/fluid/operators/bce_loss_op.h
+++ b/paddle/fluid/operators/bce_loss_op.h
@@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel<T> {
     auto x_data = x->data<T>();
     auto label_data = labels->data<T>();
     auto out_data = out->mutable_data<T>(ctx.GetPlace());
-    int x_numel = x->numel();
+    auto x_numel = x->numel();
 
     // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 -
     // x) - label * ln(x)
-    for (int i = 0; i < x_numel; ++i) {
+    for (int64_t i = 0; i < x_numel; ++i) {
       PADDLE_ENFORCE_GE(
           x_data[i], static_cast<T>(0),
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/bernoulli_op.cc b/paddle/fluid/operators/bernoulli_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c525da5953d76d4406fbdd0d9d6e98619e409f71
--- /dev/null
+++ b/paddle/fluid/operators/bernoulli_op.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/bernoulli_op.h"
+
+#include <algorithm>
+#include <string>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+namespace paddle {
+namespace operators {
+
+class BernoulliOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "A tensor with probabilities for generating the random binary "
+             "number");
+    AddOutput("Out", "A Tensor filled with random binary number");
+    AddComment(R"DOC(
+This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution.
+
+    Out ~ Bernoulli(X)
+
+)DOC");
+  }
+};
+
+class BernoulliOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    return UnaryOpUnchangedInferShape(ctx);
+  }
+};
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class BernoulliOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto x = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto *in_data = x->data<T>();
+    auto *out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int64_t size = x->numel();
+    std::uniform_real_distribution<T> dist(0.0, 1.0);
+    auto gen_ptr = framework::Generator::GetInstance();
+    std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
+
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine));
+    }
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(
+    bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(bernoulli,
+                       ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
+                       ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d0837071d456068f64ebc74b115f1a7904eba41c
--- /dev/null
+++ b/paddle/fluid/operators/bernoulli_op.cu
@@ -0,0 +1,72 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/execution_policy.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/bernoulli_op.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+// it can be consistent with cpu when CUDAGenerator is provided.
+template <typename T>
+struct BernoulliCudaFunctor {
+  unsigned int seed_;
+  __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n, const T p) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(0.0, 1.0);
+    rng.discard(n);
+    return static_cast<T>(dist(rng) < p);
+  }
+};
+
+template <typename T>
+class BernoulliOpKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::random_device rd;
+    auto seed = rd();
+    const auto x = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto* in_data = x->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int64_t size = x->numel();
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    platform::Transform<platform::CUDADeviceContext> trans;
+    auto* context =
+        static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
+    trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
+          out_data, BernoulliCudaFunctor<T>(seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    bernoulli, ops::BernoulliOpKernel<plat::CUDADeviceContext, float>,
+    ops::BernoulliOpKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..06a83ada17bb926d6f7d4eef10750986d00f048c
--- /dev/null
+++ b/paddle/fluid/operators/bernoulli_op.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Samples a bernoulli distribution given a probability input
+ */
+
+template <typename T>
+inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
+  PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange(
+                              "The probability should be <= 1, but got %f", p));
+  PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange(
+                              "The probability should be >= 1, but got %f", p));
+  return static_cast<T>(rand < p);
+}
+
+template <typename DeviceContext, typename T>
+class BernoulliOpKernel;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu
index c44299686516e968692fe146a5c324c7f1fa83d2..530147609fe1e47320a1cbd9223ccdfb82ba7e7a 100644
--- a/paddle/fluid/operators/cholesky_op.cu
+++ b/paddle/fluid/operators/cholesky_op.cu
@@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
       for_range(matrix_band_part_functor);
     }
 
-    // TODO(guosheng): Add callback to check info
     auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count);
     auto* info_ptr = reinterpret_cast<int*>(info->ptr());
 
@@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
 #if CUDA_VERSION >= 9020 && !defined(_WIN32)
     }
 #endif
+    // check the info
+    std::vector<int> error_info;  // only for checking positive matrix
+    error_info.resize(batch_count);
+
+    memory::Copy(platform::CPUPlace(), error_info.data(),
+                 BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
+                 info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
+
+    for (int i = 0; i < batch_count; ++i) {
+      PADDLE_ENFORCE_EQ(error_info[i], 0,
+                        platform::errors::PreconditionNotMet(
+                            "For batch [%d]: U(%d, %d) is zero, singular U.", i,
+                            error_info[i], error_info[i]));
+    }
   }
 
   void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo,
diff --git a/paddle/fluid/operators/cholesky_op.h b/paddle/fluid/operators/cholesky_op.h
index b0280b00ecf447d36b199e6b6765fa7928e081f0..15dd8315362ed0221c5c8b9c523af37da38dfd7e 100644
--- a/paddle/fluid/operators/cholesky_op.h
+++ b/paddle/fluid/operators/cholesky_op.h
@@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel<T> {
             Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
             Eigen::UpLoType::Upper>
             llt_decomposition(input);
-        PADDLE_ENFORCE_EQ(
-            llt_decomposition.info(), Eigen::Success,
-            platform::errors::InvalidArgument(
-                "Cholesky decomposition was not successful. The input matrice "
-                "might not be not be positive definite."));
+        PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
+                          platform::errors::InvalidArgument(
+                              "Cholesky decomposition was not successful. The "
+                              "%d-th input matrice "
+                              "might not be not be positive definite.",
+                              i));
         output = llt_decomposition.matrixU();
       } else {
         Eigen::LLT<
             Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
             Eigen::UpLoType::Lower>
             llt_decomposition(input);
-        PADDLE_ENFORCE_EQ(
-            llt_decomposition.info(), Eigen::Success,
-            platform::errors::InvalidArgument(
-                "Cholesky decomposition was not successful. The input matrice "
-                "might not be not be positive definite."));
+        PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
+                          platform::errors::InvalidArgument(
+                              "Cholesky decomposition was not successful. The "
+                              "%d-th input matrice "
+                              "might not be not be positive definite.",
+                              i));
         output = llt_decomposition.matrixL();
       }
     }
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 3f9423ae5c26433084eb040a4c38b14feabfe89e..686b3039d4dea93042463250e17f7558a318377c 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -35,5 +35,9 @@ if(WITH_NCCL)
     op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
 endif()
 
+if(WITH_GLOO)
+    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
+endif()
+
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
 set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
diff --git a/paddle/fluid/operators/collective/barrier_op.cc b/paddle/fluid/operators/collective/barrier_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f154a42e2be8f825e6b4a386c0c262f31b0edda
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+
+#include <memory>
+
+namespace paddle {
+namespace operators {
+
+class BarrierOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+class BarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) Input data (only used in CUDAKernel).");
+    AddOutput("Out", "(Tensor) Output data (only used in CUDAKernel).");
+    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Barrier Operator - Barrier among all pariticapitors.)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(barrier, ops::BarrierOp, ops::BarrierOpMaker);
+REGISTER_OP_CPU_KERNEL(barrier, ops::BarrierOpCPUKernel<int>);
diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cad7bda63046fb5c980f9dc1c902d51f51f759
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.cu.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+
+#include <memory>
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class BarrierOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data<void>();
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    auto stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    ncclRedOp_t nccl_red_type = ncclSum;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
+    auto comm_stream =
+        platform::NCCLCommContext::Instance().Get(rid, place)->stream();
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with NCCL."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(barrier, ops::BarrierOpCUDAKernel<int>);
diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a195a43540669921e99ab1d0f6163179ba0455
--- /dev/null
+++ b/paddle/fluid/operators/collective/barrier_op.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/barrier.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class BarrierOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_GLOO)
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::BarrierOptions opts(gloo->GetContext());
+    gloo::barrier(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h
index fe99a9e128d1892a093c090f33f065ae2a158056..ec55a14d085e5ef23074b4924b453a74e7478216 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.h
+++ b/paddle/fluid/operators/collective/c_allgather_op.h
@@ -23,6 +23,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/allgather.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -30,7 +35,31 @@ template <typename T>
 class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp.");
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    framework::DDim out_dims = in->dims();
+    auto place = ctx.GetPlace();
+
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    auto nranks = gloo->Size();
+    out_dims[0] *= nranks;
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(out_dims, place);
+
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::AllgatherOptions opts(gloo->GetContext());
+    opts.setInput(const_cast<T*>(send_buff), send_numel);
+    opts.setOutput(recv_buff, send_numel * nranks);
+    gloo::allgather(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 096a2f6a0959768bcb99d87b0d42edf71d98f481..be518b3bf0a397d7c20f6f1f03878987a43e84dc 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -25,6 +25,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/allreduce.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -50,7 +55,53 @@ template <ReduceType red_type, typename T>
 class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("CAllReduce op do not support CPUKernel for now.");
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(in->dims(), place);
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::AllreduceOptions opts(gloo->GetContext());
+    opts.setInput(const_cast<T*>(send_buff), send_numel);
+    opts.setOutput(recv_buff, send_numel);
+    switch (red_type) {
+      case kRedSum:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::sum<T>));
+        break;
+      case kRedMax:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::max<T>));
+        break;
+      case kRedMin:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::min<T>));
+        break;
+      case kRedProd:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::product<T>));
+        break;
+      default:
+        PADDLE_ENFORCE_EQ(true, false,
+                          platform::errors::InvalidArgument(
+                              "Invalid reduce type: %d.", red_type));
+    }
+    gloo::allreduce(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h
index 4ceb0aa835fe116cdc14444dfb7ea6046f33c482..eb4acb9a369fc7bfa8e23b5544f54c955d4a87b6 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.h
+++ b/paddle/fluid/operators/collective/c_broadcast_op.h
@@ -22,6 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/broadcast.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -29,7 +34,27 @@ template <typename T>
 class CBroadcastOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp.");
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto root = ctx.Attr<int>("root");
+
+    auto place = ctx.GetPlace();
+    int64_t send_numel = in->numel();
+    T* recv_buff = out->mutable_data<T>(in->dims(), place);
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::BroadcastOptions opts(gloo->GetContext());
+    opts.setOutput(recv_buff, send_numel);
+    opts.setRoot(root);
+    gloo::broadcast(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index b460dc40ede31cc66e46e9ada1d4b8b5ed9dba8c..a3bf1f4dfb1cb09fc864f891dda793ecde9027c6 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -52,10 +52,12 @@ class CCommInitOp : public framework::OperatorBase {
     int nranks = Attr<int>("nranks");
     int rank_id = Attr<int>("rank");
     int rid = Attr<int>("ring_id");
-
+    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
+    if (Attr<int>("device_id") >= 0) {
+      device_id = Attr<int>("device_id");
+    }
     platform::NCCLCommContext::Instance().CreateNCCLComm(
-        nccl_id, nranks, rank_id,
-        BOOST_GET_CONST(platform::CUDAPlace, place).device, rid);
+        nccl_id, nranks, rank_id, device_id, rid);
 #else
     PADDLE_THROW("PaddlePaddle should compile with GPU.");
 #endif
@@ -74,6 +76,11 @@ Initialize collective communicatoin context within this trainer
     AddAttr<int>("nranks", "(int) The number of ranks of distributed trainers");
     AddAttr<int>("rank",
                  "(int) The rank of the trainer in distributed training.");
+    AddAttr<int>("device_id",
+                 "(int) The deivce_id on which to initialize the communicator."
+                 "Now, you only have to set this attr manually for pipeline "
+                 "training. Otherwise, make it as default.")
+        .SetDefault(-1);
     AddAttr<int>("ring_id", "(int default 0) user specified ring id")
         .SetDefault(0);
   }
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..425351877689f7e3ad8e0a46d2226f5f751a4016
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceMaxOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Max"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, ops::CReduceOp,
+                             ops::CReduceMaxOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_max,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedMax, plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e260346b4bdd8aced0df59c72f5adb4c479e8d0
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_max,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e849641e639eeceb48fc95656b269988c827006
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceMinOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Min"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, ops::CReduceOp,
+                             ops::CReduceMinOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_min,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedMin, plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..77a75ed0b7af2a7946c02bfa0f33038aa0090c5b
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_min,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..81dc5c35bf14e569fe90743c1dc62a61fd5655ba
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_op.h
@@ -0,0 +1,201 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/reduce.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };
+
+class CReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+template <ReduceType red_type, typename T>
+class CReduceOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto root_id = ctx.Attr<int>("root_id");
+
+    auto place = ctx.GetPlace();
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(in->dims(), place);
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+    gloo::ReduceOptions opts(gloo->GetContext());
+    opts.setInput(const_cast<T*>(send_buff), send_numel);
+    opts.setOutput(recv_buff, send_numel);
+    opts.setRoot(root_id);
+    switch (red_type) {
+      case kRedSum:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::sum<T>));
+        break;
+      case kRedMax:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::max<T>));
+        break;
+      case kRedMin:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::min<T>));
+        break;
+      case kRedProd:
+        opts.setReduceFunction(
+            static_cast<void (*)(void*, const void*, const void*, size_t)>(
+                &gloo::product<T>));
+        break;
+      default:
+        PADDLE_ENFORCE_EQ(true, false,
+                          platform::errors::InvalidArgument(
+                              "Invalid reduce type: %d.", red_type));
+    }
+    gloo::reduce(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
+  }
+};
+
+template <ReduceType red_type, typename T>
+class CReduceOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data<void>();
+    out->Resize(in->dims());
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    int root = ctx.Attr<int>("root_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    ncclRedOp_t nccl_red_type = ncclSum;
+    switch (red_type) {
+      case kRedSum:
+        nccl_red_type = ncclSum;
+        break;
+
+      case kRedMax:
+        nccl_red_type = ncclMax;
+        break;
+
+      case kRedMin:
+        nccl_red_type = ncclMin;
+        break;
+
+      case kRedProd:
+        nccl_red_type = ncclProd;
+        break;
+
+      default:
+        PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument(
+                                           "red_type must be one of kRedSum, "
+                                           "kRedMax, kRedMin, kRedProd."));
+    }
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
+        sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(),
+        stream));
+#else
+    PADDLE_ENFORCE_EQ(true, false,
+                      platform::errors::Unavailable(
+                          "PaddlePaddle should compile with GPU.."));
+#endif
+  }
+};
+
+class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor), tensor to be reduced.");
+    AddOutput("Out", "(Tensor) the reduced result.");
+    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+CReduce %s Operator
+
+Call collective Reduce with reduce type %s. If input and output are
+the same variable, in-place reduce will be used.
+)DOC",
+                               GetName(), GetName()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..64935df856ec79f427bdcd21e03b7c493c31ac1e
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceProdOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Prod"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, ops::CReduceOp,
+                             ops::CReduceProdOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_prod,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07e431f7bc838caa9bc3abdcd0be1beb94b96635
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_prod,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e20cee7e186a462aedc1881c6e34cacc8d09de0
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CReduceSumOpMaker : public CReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Sum"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, ops::CReduceOp,
+                             ops::CReduceSumOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_reduce_sum,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, float>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, double>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, int>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, int64_t>,
+                       ops::CReduceOpCPUKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9826422c16cb67f9f7101643918a83898c606b3
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_reduce_sum,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, float>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, double>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, int>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, int64_t>,
+                        ops::CReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_scatter_op.cc b/paddle/fluid/operators/collective/c_scatter_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..908708e6e328f54466d4bb69b30fd607e14d1fe9
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_scatter_op.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_scatter_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CScatterOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CScatter");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CScatter");
+    int root_id = ctx->Attrs().Get<int>("root");
+    int ring_id = ctx->Attrs().Get<int>("ring_id");
+    int nranks = ctx->Attrs().Get<int>("nranks");
+    PADDLE_ENFORCE_GE(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The number of ranks (%d) must be greater than 1 "
+                          "to use collective op (c_scatter op).",
+                          nranks));
+    PADDLE_ENFORCE_GE(
+        root_id, 0,
+        platform::errors::InvalidArgument(
+            "The root_id (%d) for c_scatter_op must be non-negative.",
+            root_id));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_scatter_op must be non-negative.",
+            root_id));
+    framework::DDim dim = ctx->GetInputDim("X");
+    dim[0] = dim[0] / nranks;
+    if (dim[0] < 0) dim[0] = -1;
+    ctx->SetOutputDim("Out", dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class CScatterOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be broadcasted.");
+    AddOutput("Out", "(Tensor) the result of broadcast.");
+    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("root", "(int default 0) root id for broadcasting.")
+        .SetDefault(0);
+    AddAttr<int>("nranks", "(int default 1) number of ranks.").SetDefault(0);
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+CScatter Operator
+Scatter the source to all participators.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(c_scatter, ops::CScatterOp, ops::CScatterOpMaker);
+
+REGISTER_OP_CPU_KERNEL(c_scatter, ops::CScatterOpCPUKernel<float>,
+                       ops::CScatterOpCPUKernel<double>,
+                       ops::CScatterOpCPUKernel<int>,
+                       ops::CScatterOpCPUKernel<int64_t>,
+                       ops::CScatterOpCPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d9e6b4b7d99044f584e9e21062a786252d60f76
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_scatter_op.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CScatterOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = x->numel();
+    ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
+
+    int nranks = ctx.Attr<int>("nranks");
+    int root_id = ctx.Attr<int>("root");
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+    PADDLE_ENFORCE_EQ(nranks, comm->nranks(),
+                      platform::errors::InvalidArgument(
+                          "The number of ranks (%d) you set of must "
+                          "be equal to comm->nranks (%d).",
+                          nranks, comm->nranks()));
+    PADDLE_ENFORCE_GE(
+        root_id, 0,
+        platform::errors::InvalidArgument(
+            "The root_id (%d) for c_scatter_op must be non-negative.",
+            root_id));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_scatter_op must be non-negative.",
+            ring_id));
+
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    framework::DDim x_dims = x->dims();
+    framework::DDim out_dims(x_dims);
+    framework::Tensor temp;
+    auto out_ptr = temp.mutable_data<T>(out_dims, place);
+    if (root_id == comm->rank()) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
+          root_id, comm->comm(), stream));
+
+      framework::TensorCopy(*static_cast<const framework::Tensor*>(x), place,
+                            *platform::DeviceContextPool::Instance().Get(place),
+                            static_cast<framework::Tensor*>(&temp));
+    } else {
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+          out_ptr, numel, dtype, root_id, comm->comm(), stream));
+    }
+
+    out_dims[0] = out_dims[0] / nranks;
+    auto start_index = out_dims[0] * comm->rank();
+    auto end_index = start_index + out_dims[0];
+    temp = temp.Slice(start_index, end_index);
+    temp.Resize(out_dims);
+    out->mutable_data<T>(out_dims, place);
+    framework::TensorCopySync(*static_cast<const framework::Tensor*>(&temp),
+                              place, static_cast<framework::Tensor*>(out));
+    out->Resize(out_dims);
+#else
+    PADDLE_ENFORCE_EQ(
+        true, false,
+        platform::errors::Unavailable("PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(c_scatter, ops::CScatterOpCUDAKernel<float>,
+                        ops::CScatterOpCUDAKernel<double>,
+                        ops::CScatterOpCUDAKernel<int>,
+                        ops::CScatterOpCUDAKernel<int64_t>,
+                        ops::CScatterOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..71a5f488ebc11a93cece9b85f6af288a4662b2d8
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_scatter_op.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/scatter.h>
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CScatterOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_GLOO)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    auto root_id = ctx.Attr<int>("root");
+
+    auto gloo = paddle::framework::GlooWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(
+        gloo->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "You must initialize the gloo environment first to use it."));
+
+    int64_t send_numel = out->numel();
+    auto nranks = gloo->Size();
+    auto rank = gloo->Rank();
+    T* recv_buff = out->data<T>();
+    gloo::ScatterOptions opts(gloo->GetContext());
+    if (root_id == rank) {
+      T* send_buff = const_cast<T*>(in->data<T>());
+      std::vector<T*> ptrs(nranks);
+      for (int i = 0; i < nranks; ++i) {
+        ptrs[i] = send_buff;
+        send_buff += send_numel;
+      }
+      opts.setInputs(ptrs, send_numel);
+    }
+    opts.setOutput(recv_buff, send_numel);
+    opts.setRoot(root_id);
+
+    gloo::scatter(opts);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON"));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22b212fc1b9f8844f0ae3555ac6d63af1f48d1cd
--- /dev/null
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+#include <algorithm>
+#include <vector>
+
+// This file almostly contains all the infershape functions that are used in
+// operators.
+
+namespace paddle {
+namespace operators {
+namespace details {
+inline void GetBroadcastDimsArrays(const framework::DDim &x_dims,
+                                   const framework::DDim &y_dims,
+                                   int *x_dims_array, int *y_dims_array,
+                                   int *out_dims_array, const int max_dim,
+                                   const int axis) {
+  PADDLE_ENFORCE_GE(
+      axis, 0,
+      platform::errors::InvalidArgument(
+          "Axis should be great than or equal to 0, but received axis is %d.",
+          axis));
+  PADDLE_ENFORCE_LT(axis, max_dim,
+                    platform::errors::InvalidArgument(
+                        "Axis should be less than %d, but received axis is %d.",
+                        max_dim, axis));
+  if (x_dims.size() > y_dims.size()) {
+    std::fill(y_dims_array, y_dims_array + axis, 1);
+    if (axis + y_dims.size() < max_dim) {
+      std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1);
+    }
+    std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array);
+    std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array + axis);
+  } else {
+    std::fill(x_dims_array, x_dims_array + axis, 1);
+    if (axis + x_dims.size() < max_dim) {
+      std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1);
+    }
+    std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis);
+    std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array);
+  }
+
+  for (int i = 0; i < max_dim; i++) {
+    PADDLE_ENFORCE_EQ(
+        x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
+            y_dims_array[i] <= 1,
+        true, platform::errors::InvalidArgument(
+                  "Broadcast dimension mismatch. Operands could "
+                  "not be broadcast together with the shape of X = [%s] and "
+                  "the shape of Y = [%s]. Received [%d] in X is not equal to "
+                  "[%d] in Y at i:%d.",
+                  x_dims, y_dims, x_dims_array[i], y_dims_array[i], i));
+    if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) ||
+        (x_dims_array[i] == 1 && y_dims_array[i] == 1)) {
+      out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
+    } else {
+      out_dims_array[i] = -1;
+    }
+  }
+}
+}  // namespace details
+
+// shape input(0) -> output(0) without change.
+void UnaryOpUnchangedInferShape(framework::InferShapeContext *ctx) {
+  auto x_name = ctx->GetInputNameByIdx(0);
+  auto out_name = ctx->GetOutputNameByIdx(0);
+  ctx->ShareDim(x_name, /*->*/ out_name);
+  ctx->ShareLoD(x_name, /*->*/ out_name);
+}
+
+// shape input(0) -> output(0) without change, check if axis in range [-Rank(x),
+// Rank(x)-1]
+void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext *ctx) {
+  auto x_name = ctx->GetInputNameByIdx(0);
+  auto out_name = ctx->GetOutputNameByIdx(0);
+  auto x_dim = ctx->GetInputDim(x_name);
+  auto x_rank = x_dim.size();
+  auto axis = ctx->Attrs().Get<int>("axis");
+  PADDLE_ENFORCE_GE(
+      axis, -x_rank,
+      platform::errors::InvalidArgument(
+          "Attr(axis) value should be in range [-R, R-1], "
+          "R is the rank of Input(X). But received axis: %d, R: %d.",
+          axis, x_rank));
+  PADDLE_ENFORCE_LT(
+      axis, x_rank,
+      platform::errors::InvalidArgument(
+          "Attr(axis) value should be in range [-R, R-1], "
+          "R is the rank of Input(X). But received axis: %d, R: %d.",
+          axis, x_rank));
+  ctx->ShareDim(x_name, /*->*/ out_name);
+  ctx->ShareLoD(x_name, /*->*/ out_name);
+}
+
+// broadcast input(0) and input(1) -> output(0)
+void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) {
+  auto x_name = ctx->GetInputNameByIdx(0);
+  auto y_name = ctx->GetInputNameByIdx(1);
+  auto out_name = ctx->GetOutputNameByIdx(0);
+  auto x_dims = ctx->GetInputDim(x_name);
+  auto y_dims = ctx->GetInputDim(y_name);
+  PADDLE_ENFORCE_EQ(
+      ctx->GetInputsVarType(y_name).front(),
+      framework::proto::VarType::LOD_TENSOR,
+      platform::errors::InvalidArgument(
+          "The var type of input %s should be LoDTensor, but got %s.",
+          ctx->Inputs(y_name).front(), ctx->GetInputsVarType(y_name).front()));
+
+  if (ctx->GetInputsVarType(x_name).front() ==
+      framework::proto::VarType::SELECTED_ROWS) {
+    PADDLE_ENFORCE_EQ(y_dims.size(), 1u,
+                      platform::errors::InvalidArgument(
+                          "For binary broadcastable operator, if X is "
+                          "Sparse(VarType.SELECTED_ROWS"
+                          "), Y must be scalar, and the size of Y should be 1. "
+                          "But reveived the size of Y = %s.",
+                          y_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        y_dims[0], 1,
+        platform::errors::InvalidArgument(
+            "For binary broadcastable operator, if X is "
+            "Sparse(VarType.SELECTED_ROWS"
+            "), Y must be scalar, the first dimension of Y should be 1. "
+            "But reveived the first dimension of Y = %s.",
+            y_dims[0]));
+  } else if (ctx->GetInputsVarType(x_name).front() !=
+             framework::proto::VarType::LOD_TENSOR) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "For binary broadcastable operator, the var type of input X should "
+        "be LOD_TENSOR, but got %s",
+        ctx->GetInputsVarType(x_name).front()));
+  }
+
+  if (x_dims == y_dims) {
+    ctx->ShareDim(x_name, /*->*/ out_name);
+    ctx->ShareLoD(x_name, /*->*/ out_name);
+  } else {
+    int max_dim = std::max(x_dims.size(), y_dims.size());
+    int axis = ctx->Attrs().Get<int>("axis");
+    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+    std::vector<int> x_dims_array(max_dim);
+    std::vector<int> y_dims_array(max_dim);
+    std::vector<int> out_dims_array(max_dim);
+    details::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                                    y_dims_array.data(), out_dims_array.data(),
+                                    max_dim, axis);
+    ctx->SetOutputDim(out_name, framework::make_ddim(out_dims_array));
+    ctx->ShareLoD(x_name, /*->*/ out_name);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/common_infer_shape_functions.h b/paddle/fluid/operators/common_infer_shape_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cb9eab2865ce068a4f776bc63070c59bf029481
--- /dev/null
+++ b/paddle/fluid/operators/common_infer_shape_functions.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+// This file almostly contains all the infershape functions that are used in
+// operators.
+
+namespace paddle {
+namespace operators {
+
+// shape input(0) -> output(0) without change.
+void UnaryOpUnchangedInferShape(framework::InferShapeContext* ctx);
+// shape input(0) -> output(0) without change, check if axis in range [-Rank(x),
+// Rank(x)-1]
+void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext* ctx);
+// broadcast input(0) and input(1) -> output(0)
+void BinaryOpBroadcastInferShape(framework::InferShapeContext* ctx);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 060f5412f28e3704e64d33d9a3081a2ca934e918..4f337c03599a548ac3d95ddd06c726be30d7c13f 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -122,12 +122,16 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
              "It has higher priority than Attr(axis). "
              "The shape of AxisTensor must be [1].")
         .AsDispensable();
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
     AddComment(R"DOC(
 Concat Operator.
 
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 57a5db88c53a393e80204ef94acc9fbc3472e2f7..9ed169fe3502e0c34b9f37d6520edc1a3fbfa91c 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -196,7 +196,7 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar(
     auto ar = paddle::framework::AttrReader(attrs);
     const std::string data_format = ar.Get<std::string>("data_format");
     auto dl = framework::StringToDataLayout(data_format);
-    // Some models may have intentionally set "AnyLayout" for pool
+    // Some models may have intentionally set "AnyLayout" for conv
     // op. Treat this as NCHW (default data_format value)
     if (dl != framework::DataLayout::kAnyLayout) {
       return framework::OpKernelType(expected_kernel_type.data_type_,
@@ -279,12 +279,16 @@ void Conv2DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
-  AddAttr<bool>("use_quantizer",
-                "(bool, default false) "
-                "Set to true for operators that should be quantized and use "
-                "int8 kernel. "
-                "Only used on CPU.")
+  AddAttr<bool>(
+      "use_quantizer",
+      "(bool, default false) "
+      "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
       .SetDefault(false);
+  AddAttr<std::string>(
+      "mkldnn_data_type",
+      "(string, default \"float32\"). Data type of mkldnn kernel")
+      .SetDefault("float32")
+      .InEnum({"float32", "int8", "bfloat16"});
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
   AddAttr<bool>("fuse_brelu",
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index b44aa4ce4f893720ef55a7daf1d7b1e757c7480c..25e887ba6675e6c28bcd44c3b57c2ea571c075e3 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   auto filter_dims = ctx->GetInputDim("Filter");
   std::vector<int> output_size =
       ctx->Attrs().Get<std::vector<int>>("output_size");
+  std::vector<int> output_padding =
+      ctx->Attrs().Get<std::vector<int>>("output_padding");
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
@@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
         platform::errors::InvalidArgument(
             "The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
             "should be the same."));
+  if (output_padding.size())
+    PADDLE_ENFORCE_EQ(
+        output_padding.size(), strides.size(),
+        platform::errors::InvalidArgument(
+            "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
+            "should be the same."));
 
   const int64_t C =
       (data_layout != DataLayout::kNHWC ? in_dims[1]
@@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                 infer_shape + strides[i]));
       }
       output_shape.push_back(output_size[i]);
+    } else if (output_padding.size()) {
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_GE(
+            output_padding[i], 0,
+            platform::errors::InvalidArgument(
+                "output_padding of Op(ConvTransposeOp) should not be "
+                "less than the 0. But received output_padding = "
+                "[%s], whose dim %d is less than 0",
+                framework::make_ddim(output_padding), i));
+        PADDLE_ENFORCE_LT(
+            output_padding[i], std::max(strides[i], dilations[i]),
+            platform::errors::InvalidArgument(
+                "output_padding of Op(ConvTransposeOp) should be less "
+                "than either stride or dilation. But received output_size = "
+                "[%s], "
+                "whose dim %d is not less than either stride (%d)  or "
+                "dilation (%d)",
+                framework::make_ddim(output_size), i, strides[i],
+                dilations[i]));
+      }
+      output_shape.push_back((infer_shape + output_padding[i]));
     } else {
       output_shape.push_back(infer_shape);
     }
@@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() {
            "The format of output tensor is X (one-dimensional) of size equal"
            "to the number of output channels. Only used with MKL-DNN.")
       .AsDispensable();
-
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is the same as input tensor.");
+  AddAttr<std::vector<int>>("output_padding",
+                            "(vector<int> default: []), Additional size added "
+                            "to one side of each dimension in the output "
+                            "shape")
+      .SetDefault({});
   AddAttr<std::vector<int>>("output_size",
                             "(vector<int> default: []), the "
                             "size of the output tensor")
@@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() {
             "Where N is batch size, C is "
             "the number of channels, D is the depth of the feature, H is the "
             "height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>("output_padding",
+                            "(vector<int> default: []), Additional size added "
+                            "to one side of each dimension in the output "
+                            "shape")
+      .SetDefault({});
   AddAttr<std::vector<int>>("output_size",
                             "(vector<int> default: []), the "
                             "size of the output tensor")
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index 16e2ca464b5c4de6aa65109cd794d17e4dcd6a2a..7081490fd1bf0e26cb8aa90d69a76a5476cef044 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -24,34 +24,62 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(Weight) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("InitH"),
-                   "Input(init_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InitC"),
-                   "Input(init_c) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cache"),
-                   "Input(Cache) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("last_h"),
-                   "Output(last_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("last_c"),
-                   "Output(last_c) of LSTM should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Reserve"), "Output", "Reserve", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("StateOut"), "Output", "StateOut",
+                   "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("LastH"), "Output", "LastH", "CudnnLSTM");
+    OP_INOUT_CHECK(ctx->HasOutput("LastC"), "Output", "LastC", "CudnnLSTM");
 
     auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3.");
+    auto init_dims = ctx->GetInputDim("InitH");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The rank of Input in CudnnLSTM  must be 3. But "
+                          "received Input's rank is %d.",
+                          in_dims.size()));
+    PADDLE_ENFORCE_EQ(init_dims.size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The rank of InitH in CudnnLSTM  must be 3. But "
+                          "received InitH's rank is %d.",
+                          init_dims.size()));
+
+    PADDLE_ENFORCE_EQ(in_dims[1], init_dims[1],
+                      platform::errors::InvalidArgument(
+                          "The in_dims[1] (Input dims) and init_dims[1] (InitH "
+                          "dims) should be equal. But "
+                          "received in_dims[1] is %d and init_dims[1] is %d.",
+                          in_dims[1], init_dims[1]));
+    PADDLE_ENFORCE_EQ(in_dims[2], init_dims[2],
+                      platform::errors::InvalidArgument(
+                          "The in_dims[2] (Input dims) and init_dims[2] (InitH "
+                          "dims) should be equal. But "
+                          "received in_dims[2] is %d and init_dims[2] is %d.",
+                          in_dims[2], init_dims[2]));
 
     auto out_dims = in_dims;
     auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
-    out_dims[2] = hidden_size;
+    bool is_bidirec = ctx->Attrs().Get<bool>("is_bidirec");
+    out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size;
 
+    auto last_dims = init_dims;
+    last_dims[0] = is_bidirec ? last_dims[0] * 2 : last_dims[0];
     ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH"));
-    ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC"));
+    ctx->SetOutputDim("LastH", last_dims);
+    ctx->SetOutputDim("LastC", last_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
+        ctx.device_context());
   }
 };
 
@@ -84,33 +112,31 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) the learnable hidden-hidden weights."
              " The shape is (N), where N is total weight size of the LSTM. "
              " cudnn concatenate all the weight to one Tensor");
-    AddInput("Cache",
-             "The cache of dropout op, a RAW type variable including random "
-             "number generator states and some descriptors, which is used in "
-             "cudnn kernel.")
-        .AsDispensable();
+    AddOutput("Reserve",
+              "(Tensor, a temporary output Tensor to store the reserve_data "
+              "of cudnn kernel.")
+        .AsIntermediate();
+    AddOutput("StateOut",
+              "Share memory with State. "
+              "Store the global drop state when training");
     AddOutput("Out",
               "(Tensor) the hidden state of LSTM operator. "
               "The shape is ( seq_len x batch_size x hidden_size) if "
               "is_bidirec is False"
               "and When is_bidirec is True, the shape will be ( seq_len x "
               "batch_size x hidden_size * 2) ");
-    AddOutput("last_h",
+    AddOutput("LastH",
               "(Tensor) the hidden state of the last step. "
               "The shape is ( num_layers x batch_size x hidden_size) if "
               "is_bidirec is False"
               "and When is_bidirec is True, the shape will be (num_layers*2 x "
               "batch_size x hidden_size)");
-    AddOutput("last_c",
+    AddOutput("LastC",
               "(Tensor) the cell state of the last step"
               "The shape is ( num_layers x batch_size x hidden_size) if "
               "is_bidirec is False"
               "and When is_bidirect is True, the shape will be (num_layers*2 x "
               "batch_size x hidden_size*2)");
-    AddAttr<int>("max_len",
-                 "max length of the LSTM op"
-                 "the first dim of the Input can NOT be greater than max_len")
-        .SetDefault(20);
     AddAttr<float>(
         "dropout_prob",
         "dropout prob of the dropout op"
@@ -120,14 +146,14 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("is_bidirec",
                   "is_bidirec"
                   "if it is bidirectional rnn"
-                  "The will affect the shape of the Out, last_h, and last_c")
+                  "The will affect the shape of the Out, LastH, and LastC")
         .SetDefault(false);
     AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
     AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
     AddAttr<int>("num_layers", "the total layer number of the LSTM")
         .SetDefault(1);
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
-    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
+    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
     AddComment(R"DOC(
 CUDNN LSTM implementation
 
@@ -172,16 +198,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Cache"),
-                   "Input(last_c) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("InitH"),
-                   "Input(init_h) of LSTM should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasInput("InitC"),
-                   "Input(init_c) of LSTM should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad");
+    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad");
+    OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad");
+    OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad");
 
     auto SetOutGradDim = [&ctx](const std::string& name) {
       auto g_name = framework::GradVarName(name);
@@ -195,6 +215,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
     SetOutGradDim("InitH");
     SetOutGradDim("InitC");
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
 };
 
 template <typename T>
@@ -209,13 +235,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("InitH", this->Input("InitH"));
     op->SetInput("InitC", this->Input("InitC"));
     op->SetInput("W", this->Input("W"));
-    if (this->HasInput("Cache")) {
-      op->SetInput("Cache", this->Input("Cache"));
-    }
+    op->SetInput("Reserve", this->Output("Reserve"));
+    op->SetInput("StateOut", this->Output("StateOut"));
     op->SetInput("Out", this->Output("Out"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetInput(framework::GradVarName("last_c"), this->OutputGrad("last_c"));
-    op->SetInput(framework::GradVarName("last_h"), this->OutputGrad("last_h"));
+    op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC"));
+    op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH"));
 
     op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
     op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 579dddee8e82183b778f03595bb4657002262073..37e5e518ea2af9bb437775c8fa7e86816bb1d8ae 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
 
 namespace paddle {
 namespace operators {
@@ -33,8 +34,10 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     auto w = ctx.Input<Tensor>("W");
 
     Tensor *out = ctx.Output<Tensor>("Out");
-    Tensor *last_h = ctx.Output<Tensor>("last_h");
-    Tensor *last_c = ctx.Output<Tensor>("last_c");
+    Tensor *last_h = ctx.Output<Tensor>("LastH");
+    Tensor *last_c = ctx.Output<Tensor>("LastC");
+    Tensor *reserve = ctx.Output<Tensor>("Reserve");
+    Tensor *state_out = ctx.Output<Tensor>("StateOut");
 
     const T *x_data = x->data<T>();
     const T *init_h_data = init_h->data<T>();
@@ -46,72 +49,56 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
     T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
 
-    size_t max_len = ctx.Attr<int>("max_len");
     float dropout_prob = ctx.Attr<float>("dropout_prob");
     bool is_bidirec = ctx.Attr<bool>("is_bidirec");
-    int input_size = ctx.Attr<int>("input_size");
     int hidden_size = ctx.Attr<int>("hidden_size");
     int num_layers = ctx.Attr<int>("num_layers");
     bool is_test = ctx.Attr<bool>("is_test");
+    int seed = ctx.Attr<int>("seed");
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    auto *cache_var = ctx.InputVar("Cache");
-    if (!cache_var) {
-      // The RAW type cache variable wouldn't be created and broadcasted on
-      // multi-devices before the first running.
-      // use parent scope to make cache persistable
-      auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
-      auto cache_var_name = ctx.InputNames("Cache")[0];
-      cache_var = scope->Var(cache_var_name);
-    }
-    CudnnRNNCache *cudnn_rnn_cache = nullptr;
-    if (cache_var->IsInitialized()) {
-      // const_cast is usually bad.
-      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
-                            ->GetMutable<CudnnRNNCache>();
-    } else {
-      // const_cast is usually bad.
-      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
-                            ->GetMutable<CudnnRNNCache>();
-      std::random_device rnd;
-      int seed = ctx.Attr<int>("seed");
-      if (seed == -1) {
-        seed = rnd();
-      }
-
-      auto input_w_numel = w->numel();
-      auto batch_size = x->dims()[1];
-      cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size,
-                            input_size, hidden_size, num_layers, dropout_prob,
-                            is_bidirec, seed, input_w_numel);
-    }
 
-    auto run_seq_len = x->dims()[0];
+    CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
+
+    auto input_w_numel = w->numel();
+    auto seq_len = x->dims()[0];
+    auto batch_size = x->dims()[1];
+    auto input_dim = x->dims()[2];
+    size_t reserve_size;
+    bool state_initialized = state_out->IsInitialized() ? true : false;
+    cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
+        framework::ToDataType(std::type_index(typeid(T))));
+    cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
+                          input_dim, hidden_size, num_layers, dropout_prob,
+                          is_bidirec, seed, input_w_numel, &reserve_size,
+                          state_out, state_initialized, cudnn_type);
+
+    auto *reserve_data = reserve->mutable_data<uint8_t>(
+        {static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
 
     if (is_test) {
       // for inference
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
-          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
-          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
-          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
-          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
-          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
+          x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
+          cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
+          w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
+          last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
+          cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
           cudnn_rnn_cache->workspace_size_));
     } else {
       // for train
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
-          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
-          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
-          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
-          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
-          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-          cudnn_rnn_cache->workspace_size_,
-          cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
-          cudnn_rnn_cache->reserve_size_));
+          handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
+          x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
+          cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
+          w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
+          last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
+          cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          cudnn_rnn_cache->workspace_size_, reserve_data, reserve_size));
     }
+    delete cudnn_rnn_cache;
   }
 };
 
@@ -123,15 +110,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto *weight = ctx.Input<Tensor>("W");
     auto *init_h = ctx.Input<Tensor>("InitH");
     auto *init_c = ctx.Input<Tensor>("InitC");
-    // auto * last_h = ctx.Input<Tensor>("last_h");
-    // auto * last_c = ctx.Input<Tensor>("last_c");
+    auto *reserve = ctx.Input<Tensor>("Reserve");
+    auto *state_out = ctx.Input<Tensor>("StateOut");
+
     auto *out = ctx.Input<Tensor>("Out");
     auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h"));
-    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c"));
-
-    // auto* init_h = ctx.Input<Tensor>("init_h");
-    // auto* init_c = ctx.Input<Tensor>("init_c");
+    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("LastH"));
+    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("LastC"));
 
     auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
@@ -140,116 +125,75 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    auto *cache_var = ctx.InputVar("Cache");
-    PADDLE_ENFORCE(cache_var->IsInitialized());
-    CudnnRNNCache *cudnn_rnn_cache =
-        const_cast<framework::Variable *>(cache_var)
-            ->GetMutable<CudnnRNNCache>();
 
     auto input_dims = input->dims();
     auto init_h_dims = init_h->dims();
     auto init_c_dims = init_c->dims();
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    weight_grad->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
-    zero(dev_ctx, in_grad, static_cast<T>(0.0));
-    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-
-    T *init_h_grad_data = NULL;
-    if (init_h_grad == nullptr) {
-      Tensor init_h_grad_temp;
-      init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
-
-      init_h_grad_data = init_h_grad_temp.data<T>();
-    } else {
-      init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, init_h_grad, static_cast<T>(0.0));
-      init_h_grad_data = init_h_grad->data<T>();
-    }
-
-    T *init_c_grad_data = NULL;
-    if (init_c_grad == nullptr) {
-      Tensor init_c_grad_temp;
-      init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
 
-      init_c_grad_data = init_c_grad_temp.data<T>();
-    } else {
-      init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, init_c_grad, static_cast<T>(0.0));
-      init_c_grad_data = init_c_grad->data<T>();
-    }
+    auto *weight_data = weight->data<T>();
+    auto *init_h_data = init_h->data<T>();
+    auto *init_c_data = init_c->data<T>();
+    auto *out_data = out->data<T>();
+    auto *out_grad_data = out_grad->data<T>();
+    auto *last_h_grad_data = last_h_grad->data<T>();
+    auto *last_c_grad_data = last_c_grad->data<T>();
 
-    const T *last_h_grad_data = NULL;
-    if (last_h_grad == nullptr) {
-      Tensor last_h_grad_temp;
-      last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
-      zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
-
-      last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
-    } else {
-      last_h_grad_data = last_h_grad->data<T>();
-    }
-
-    const T *last_c_grad_data = NULL;
-    if (last_c_grad == nullptr) {
-      Tensor last_c_grad_temp;
-      last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
-      zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
-
-      last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
-    } else {
-      last_c_grad_data = last_c_grad->data<T>();
-    }
+    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
+    weight_grad->mutable_data<T>(ctx.GetPlace());
+    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
 
-    const T *out_grad_data = NULL;
-    if (out_grad == nullptr) {
-      Tensor out_grad_temp;
-      out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
-      zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
+    in_grad->mutable_data<T>(input_dims, ctx.GetPlace());
+    auto *in_grad_data = in_grad->data<T>();
 
-      out_grad_data = (const T *)out_grad_temp.data<T>();
-    } else {
-      out_grad_data = out_grad->data<T>();
-    }
+    init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
+    auto *init_h_grad_data = init_h_grad->data<T>();
 
-    // zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
-    // zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
+    init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
+    auto *init_c_grad_data = init_c_grad->data<T>();
 
-    auto out_data = out->data<T>();
-    // auto out_grad_data = out_grad->data<T>();
-    auto weight_data = weight->data<T>();
-    auto init_h_data = init_h->data<T>();
-    auto init_c_data = init_c->data<T>();
-    auto in_grad_data = in_grad->data<T>();
+    float dropout_prob = ctx.Attr<float>("dropout_prob");
+    bool is_bidirec = ctx.Attr<bool>("is_bidirec");
+    int hidden_size = ctx.Attr<int>("hidden_size");
+    int num_layers = ctx.Attr<int>("num_layers");
+    int seed = ctx.Attr<int>("seed");
+
+    CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
+
+    auto input_w_numel = weight->numel();
+    auto seq_len = input_dims[0];
+    auto batch_size = input->dims()[1];
+    auto input_dim = input->dims()[2];
+    size_t reserve_size;
+    cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
+        framework::ToDataType(std::type_index(typeid(T))));
+    cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
+                          input_dim, hidden_size, num_layers, dropout_prob,
+                          is_bidirec, seed, input_w_numel, &reserve_size,
+                          const_cast<Tensor *>(state_out), true, cudnn_type);
 
     auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
-    auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>();
+    const uint8_t *reserve_data = reserve->data<uint8_t>();
 
-    auto run_seq_len = input_dims[0];
-    PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
-                      "cudnn running seq_len CAN not greater max_lengh");
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
-        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-        cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
-        out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
-        cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_,
-        weight_data, cudnn_rnn_cache->hx_desc_, init_h_data,
-        cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_,
-        in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data,
-        cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data,
-        cudnn_rnn_cache->workspace_size_, reserve_data,
-        cudnn_rnn_cache->reserve_size_));
+        handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->y_desc_,
+        out_data, cudnn_rnn_cache->y_desc_, out_grad_data,
+        cudnn_rnn_cache->hy_desc_, last_h_grad_data, cudnn_rnn_cache->cy_desc_,
+        last_c_grad_data, cudnn_rnn_cache->w_desc_, weight_data,
+        cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->cx_desc_,
+        init_c_data, cudnn_rnn_cache->x_desc_, in_grad_data,
+        cudnn_rnn_cache->hx_desc_, init_h_grad_data, cudnn_rnn_cache->cx_desc_,
+        init_c_grad_data, work_data, cudnn_rnn_cache->workspace_size_,
+        const_cast<uint8_t *>(reserve_data), reserve_size));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
-        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
-        cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
-        init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
+        handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
+        input->data<T>(), cudnn_rnn_cache->hx_desc_, init_h->data<T>(),
+        cudnn_rnn_cache->y_desc_, out->data<T>(),
         cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
-        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_,
-        weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
-        cudnn_rnn_cache->reserve_size_));
+        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->w_desc_,
+        weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
+        reserve_size));
+    delete cudnn_rnn_cache;
   }
 };
 
@@ -257,5 +201,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
-REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>,
+                        ops::CudnnLSTMGPUKernel<double>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>,
+                        ops::CudnnLSTMGPUGradKernel<double>);
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
index cd33338abc6223a0ae122cbb60f040562b48a761..13a3e7d09b9f628f31bb9ff3b6137acf6d929c5c 100644
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ b/paddle/fluid/operators/cudnn_rnn_cache.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
@@ -24,16 +25,12 @@ struct CudnnRNNCache {
   CudnnRNNCache() {
     x_desc_ = NULL;
     y_desc_ = NULL;
-    dx_desc_ = NULL;
-    dy_desc_ = NULL;
   }
   ~CudnnRNNCache() { release(); }
 
   cudnnRNNDescriptor_t rnn_desc_;
   cudnnTensorDescriptor_t *x_desc_;
   cudnnTensorDescriptor_t *y_desc_;
-  cudnnTensorDescriptor_t *dx_desc_;
-  cudnnTensorDescriptor_t *dy_desc_;
 
   cudnnTensorDescriptor_t hx_desc_;
   cudnnTensorDescriptor_t cx_desc_;
@@ -55,13 +52,9 @@ struct CudnnRNNCache {
   cudnnFilterDescriptor_t dw_desc_;
 
   size_t workspace_size_;
-  size_t reserve_size_;
-  framework::Tensor reserve_data_;
   framework::Tensor workspace_data_;
 
-  framework::Tensor dropout_state_;
-
-  size_t max_length_;
+  size_t seq_length_;
 
   float dropout_prob_;
   bool is_bidirec_;
@@ -72,10 +65,12 @@ struct CudnnRNNCache {
   int num_layers_;
   int seed_;
 
-  void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len,
+  void init(cudnnHandle_t handle, const platform::Place &place, size_t seq_len,
             int batch_size, int input_size, int hidden_size, int num_layers,
-            float dropout_prob, bool is_bidirec, int seed, int weight_numel) {
-    max_length_ = max_len;
+            float dropout_prob, bool is_bidirec, int seed, int weight_numel,
+            size_t *reserve_size_, framework::Tensor *dropout_state_,
+            bool initialized, cudnnDataType_t cudnn_type) {
+    seq_length_ = seq_len;
     batch_size_ = batch_size;
     input_size_ = input_size;
     hidden_size_ = hidden_size;
@@ -84,55 +79,34 @@ struct CudnnRNNCache {
     is_bidirec_ = is_bidirec;
     seed_ = seed;
 
-    x_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    y_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
-    int dim_a[3];
-    int stride_a[3];
+    const auto numDirections = is_bidirec_ ? 2 : 1;
+    auto cudnn_size =
+        cudnn_type == CUDNN_DATA_FLOAT ? sizeof(float) : sizeof(double);
+
+    x_desc_ = new cudnnTensorDescriptor_t[seq_length_];
+    y_desc_ = new cudnnTensorDescriptor_t[seq_length_];
+    std::vector<int> dims = {batch_size_, input_size_, 1};
+    std::vector<int> strides = {input_size_, 1, 1};
+
+    std::vector<int> dims_y = {batch_size_, hidden_size_ * numDirections, 1};
+    std::vector<int> strides_y = {hidden_size_ * numDirections, 1, 1};
 
-    for (size_t i = 0; i < max_length_; ++i) {
+    for (size_t i = 0; i < seq_length_; ++i) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
-      dim_a[0] = batch_size_;
-      dim_a[1] = input_size_;
-      dim_a[2] = 1;
-
-      stride_a[0] = dim_a[2] * dim_a[1];
-      stride_a[1] = dim_a[2];
-      stride_a[2] = 1;
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-
-      dim_a[0] = batch_size_;
-      dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
-      dim_a[2] = 1;
-
-      stride_a[0] = dim_a[2] * dim_a[1];
-      stride_a[1] = dim_a[2];
-      stride_a[2] = 1;
 
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+          x_desc_[i], cudnn_type, 3, dims.data(), strides.data()));
+
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+          y_desc_[i], cudnn_type, 3, dims_y.data(), strides_y.data()));
     }
 
-    dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
-    dim_a[1] = batch_size_;
-    dim_a[2] = hidden_size_;
-
-    stride_a[0] = dim_a[2] * dim_a[1];
-    stride_a[1] = dim_a[2];
-    stride_a[2] = 1;
+    std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
+                                hidden_size_};
+    std::vector<int> strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1};
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
@@ -152,33 +126,44 @@ struct CudnnRNNCache {
         platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        hx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        cx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        hy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        cy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dhx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dcx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dhy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+        dcy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data()));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
 
     size_t state_size;
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
-    dropout_state_.Resize({static_cast<int64_t>(state_size)});
-    auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
-        dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
-        seed_));
+    if (!initialized) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+      dropout_state_->Resize({static_cast<int64_t>(state_size)});
+      uint8_t *dropout_state_data =
+          dropout_state_->mutable_data<uint8_t>(place);
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
+          dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
+          seed_));
+    } else {
+      uint8_t *dropout_state_data = dropout_state_->data<uint8_t>();
+      auto dropout_state_dims = dropout_state_->dims();
+      state_size = dropout_state_dims[0];
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnRestoreDropoutDescriptor(
+              dropout_desc_, handle, dropout_prob_, dropout_state_data,
+              state_size, 0));
+    }
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
@@ -188,12 +173,12 @@ struct CudnnRNNCache {
         handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
+        CUDNN_RNN_ALGO_STANDARD, cudnn_type));
 #else
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
         rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_DATA_FLOAT));
+        cudnn_type));
 #endif
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -202,48 +187,42 @@ struct CudnnRNNCache {
         platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
-        handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
+        handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type));
+
+    PADDLE_ENFORCE_EQ(
+        weights_size_, cudnn_size * weight_numel,
+        platform::errors::InvalidArgument(
+            "The cudnn lstm and setting weight size should be same."));
 
-    PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
-                      "cudnn lstm weight size should be SAME");
     int dim_w[3];
-    dim_w[0] = weights_size_ / sizeof(float);
+    dim_w[0] = weights_size_ / cudnn_size;
     dim_w[1] = 1;
     dim_w[2] = 1;
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
-        w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+        w_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w));
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
-        dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+        dw_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w));
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
-        handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
+        handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetRNNTrainingReserveSize(
-            handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
-
-    reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
-    reserve_data_.mutable_data<uint8_t>(place);
+            handle, rnn_desc_, seq_length_, x_desc_, reserve_size_));
 
     workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
     workspace_data_.mutable_data<uint8_t>(place);
   }
 
   void release() {
-    for (size_t i = 0; i < max_length_; ++i) {
+    for (size_t i = 0; i < seq_length_; ++i) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
     }
 
     delete[] x_desc_;
     delete[] y_desc_;
-    delete[] dx_desc_;
-    delete[] dy_desc_;
 
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h
index e336e25f0f457d600d96c2059762b66e985a65c7..ab3860ecafc3569c13b0b9e5c882df9ddc03e190 100644
--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
@@ -36,25 +36,28 @@ class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
     int axis = context.Attr<int>("axis");
     bool exclusive = context.Attr<bool>("exclusive");
     bool reverse = context.Attr<bool>("reverse");
-    auto x_dims = X.dims();
-    if (axis == -1) {
-      axis = x_dims.size() - 1;
+    auto out_dims = Out.dims();
+
+    PADDLE_ENFORCE_EQ(
+        axis < out_dims.size() && axis >= (0 - out_dims.size()), true,
+        platform::errors::OutOfRange(
+            "Attr(axis) is out of range, It's expected "
+            "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
+            out_dims.size(), out_dims.size() - 1, axis));
+    if (axis < 0) {
+      axis += out_dims.size();
     }
-    PADDLE_ENFORCE_LT(
-        axis, x_dims.size(),
-        platform::errors::InvalidArgument("axis(%d) should be less than the "
-                                          "dimension(%d) of the input tensor.",
-                                          axis, x_dims.size()));
+
     Out.template mutable_data<T>(context.GetPlace());
 
     int pre = 1;
     int post = 1;
-    int mid = x_dims[axis];
+    int mid = out_dims[axis];
     for (int i = 0; i < axis; ++i) {
-      pre *= x_dims[i];
+      pre *= out_dims[i];
     }
-    for (int i = axis + 1; i < x_dims.size(); ++i) {
-      post *= x_dims[i];
+    for (int i = axis + 1; i < out_dims.size(); ++i) {
+      post *= out_dims[i];
     }
 
     auto x = framework::EigenVector<T>::Flatten(X);
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
old mode 100644
new mode 100755
index 962d73d068985a3579cb698a5a545f2c0b0503dc..2e9db16be5530f65e0cf5d4d99ee2ad936105983
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -22,7 +22,14 @@ class CumOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<bool>("flatten")) {
+      ctx->SetOutputDim(
+          "Out",
+          framework::make_ddim({framework::product(ctx->GetInputDim("X"))}));
+    } else {
+      ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    }
+
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
@@ -35,8 +42,11 @@ class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("axis",
                  "The dimension to accumulate along. -1 means the last "
                  "dimension [default -1].")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
+        .SetDefault(-1);
+    AddAttr<bool>("flatten",
+                  "Whether to compute the cumsum over the flattened array. "
+                  "[default false].")
+        .SetDefault(false);
     AddAttr<bool>("exclusive",
                   "Whether to perform exclusive cumsum. [default false].")
         .SetDefault(false);
@@ -63,6 +73,8 @@ class CumsumGradMaker : public framework::SingleGradOpMaker<T> {
     grad_op->SetInput("X", this->OutputGrad("Out"));
     grad_op->SetOutput("Out", this->InputGrad("X"));
     grad_op->SetAttr("axis", BOOST_GET_CONST(int, this->GetAttr("axis")));
+    grad_op->SetAttr("flatten",
+                     BOOST_GET_CONST(bool, this->GetAttr("flatten")));
     grad_op->SetAttr("reverse",
                      !BOOST_GET_CONST(bool, this->GetAttr("reverse")));
     grad_op->SetAttr("exclusive",
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
index 7ca5ba3289b26f9b01774b1ab0e85f075c4cfc90..cff0a101e03d54bffd172177726340b3d75b9fe9 100644
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
 #include "paddle/fluid/operators/cum_op.h"
 #include "paddle/fluid/platform/gpu_launch_param_config.h"
 
@@ -251,34 +255,62 @@ class CumCUDAKernel : public framework::OpKernel<T> {
     int axis = context.Attr<int>("axis");
     bool exclusive = context.Attr<bool>("exclusive");
     bool reverse = context.Attr<bool>("reverse");
-    auto in_dims = in->dims();
+    auto out_dims = out->dims();
     auto size = in->numel();
 
-    if (axis == -1) {
-      axis = in_dims.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        axis < out_dims.size() && axis >= (0 - out_dims.size()), true,
+        platform::errors::OutOfRange(
+            "Attr(axis) is out of range, It's expected "
+            "to be in range of [-%d, %d]. But received Attr(axis) = %d.",
+            out_dims.size(), out_dims.size() - 1, axis));
+    if (axis < 0) {
+      axis += out_dims.size();
     }
-    PADDLE_ENFORCE_LT(
-        axis, in_dims.size(),
-        platform::errors::InvalidArgument("axis(%d) should be less than the "
-                                          "dimension(%d) of the input tensor.",
-                                          axis, in_dims.size()));
-
-    int scan_dim_size = in_dims[axis];
-    bool optimize_condition = (axis == (in_dims.size() - 1)) ? true : false;
+
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* in_data = in->data<T>();
+
+    // Use thrust for parallel acceleration when the input size is equal to the
+    // length of the ‘axis’ dimension.
+    if (size == out_dims[axis]) {
+      if (reverse) {
+        thrust::device_ptr<const T> dev_ptr =
+            thrust::device_pointer_cast(in_data);
+        thrust::device_vector<T> vec(dev_ptr, dev_ptr + size);
+        if (exclusive) {
+          thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(),
+                                 out_data);
+        } else {
+          thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(),
+                                 out_data);
+        }
+        thrust::reverse(thrust::device, out_data, out_data + size);
+      } else {
+        if (exclusive) {
+          thrust::exclusive_scan(thrust::device, in_data, in_data + size,
+                                 out_data);
+        } else {
+          thrust::inclusive_scan(thrust::device, in_data, in_data + size,
+                                 out_data);
+        }
+      }
+      return;
+    }
+
+    const int& scan_dim_size = out_dims[axis];
+    bool optimize_condition = (axis == (out_dims.size() - 1)) ? true : false;
     int outer_dim_size = 1;
     int inner_dim_size = 1;
     // treat all dim index < axis as outer_dim_size
     for (size_t i = 0; i < axis; i++) {
-      outer_dim_size *= in_dims[i];
+      outer_dim_size *= out_dims[i];
     }
     // treat all dim index > axis as innner_dim_size
-    for (size_t i = axis + 1; i < in_dims.size(); i++) {
-      inner_dim_size *= in_dims[i];
+    for (size_t i = axis + 1; i < out_dims.size(); i++) {
+      inner_dim_size *= out_dims[i];
     }
 
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* in_data = in->data<T>();
-
     auto& dev_ctx = context.template device_context<DeviceContext>();
     if (optimize_condition) {
       auto nextPowerOfTwo = [](int x) -> int {
diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h
index c6140483ff5cb8108895546b6a01f058708231fd..956fd5ad3035434fbf3093786319b3f7ab7e7354 100644
--- a/paddle/fluid/operators/cvm_op.h
+++ b/paddle/fluid/operators/cvm_op.h
@@ -68,8 +68,19 @@ class CVMOpKernel : public framework::OpKernel<T> {
 
     // for Input X do not have Lod Information.
     if (x->NumLevels() == 0) {
-      for (int i = 0; i < batch_size; i++) {
-        CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
+      if (use_cvm) {
+        for (int i = 0; i < batch_size; i++) {
+          int cursor = i * item_size;
+          y_data[cursor] = log(x_data[cursor] + 1);
+          y_data[cursor + 1] = log(x_data[cursor + 1] + 1) - y_data[cursor];
+          for (int j = 2; j < item_size; j++) {
+            y_data[cursor + j] = x_data[cursor + j];
+          }
+        }
+      } else {
+        for (int i = 0; i < batch_size; i++) {
+          CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
+        }
       }
     } else {
       auto lod = x->lod()[0];
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index 16e1699e12c832d54af14f673577dcc32b015d6d..5cd853758926e622d0f87e6f8bbaba2cf3b9f85e 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -222,10 +222,12 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     } else {
       auto lod = dist_mat->lod().back();
       for (size_t i = 0; i < lod.size() - 1; ++i) {
-        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
-        BipartiteMatch(one_ins, indices + i * col, dist + i * col);
-        if (type == "per_prediction") {
-          ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
+        if (lod[i + 1] > lod[i]) {
+          Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
+          BipartiteMatch(one_ins, indices + i * col, dist + i * col);
+          if (type == "per_prediction") {
+            ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
index bd584d660f7d4f76f9ea30354f3b0c2696b6d048..0d293bb964b615c68891be516534a05cc2277426 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/detection/prior_box_op.h"
 
+#include <string>
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -218,12 +220,16 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
     AddComment(R"DOC(
 Prior box operator
 Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67dc2843345682b2dfe3d568e452461860575544
--- /dev/null
+++ b/paddle/fluid/operators/diag_v2_op.cc
@@ -0,0 +1,140 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/diag_v2_op.h"
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class DiagV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "diag_v2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag_v2");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto offset = ctx->Attrs().Get<int>("offset");
+
+    if (x_dims.size() == 1UL) {
+      int64_t size = x_dims[0] + std::abs(offset);
+      ctx->SetOutputDim("Out", {size, size});
+    } else if (x_dims.size() == 2UL) {
+      int64_t size;
+      if (offset >= 0) {
+        size = std::min(x_dims[0], x_dims[1] - offset);
+      } else {
+        size = std::min(x_dims[0] + offset, x_dims[1]);
+      }
+      ctx->SetOutputDim("Out", {size});
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The input tensor X's dimensions of DiagV2Op should be either 1 or "
+          "2, but received %d.",
+          x_dims.size()));
+    }
+  }
+};
+
+class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor. Its shape is either 1-D or 2-D.");
+    AddOutput("Out", "The output tensor. A square matrix or a vector.");
+    AddAttr<int>("offset",
+                 "The diagonal offset. A positive value represents "
+                 "superdiagonal, 0 represents the main diagonal, and a "
+                 "negative value represents subdiagonal.")
+        .SetDefault(0);
+    AddAttr<float>("padding_value",
+                   "Use this value to fill the area outside the specified "
+                   "diagonal band. Only takes effect when the input is a 1-D "
+                   "Tensor. The default value is 0.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+      If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned.
+
+      If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned.
+
+      The argument ``offset`` controls the diagonal offset:
+
+      If ``offset`` = 0, it is the main diagonal.
+
+      If ``offset`` > 0, it is superdiagonal.
+
+      If ``offset`` < 0, it is subdiagonal.
+)DOC");
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DiagV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* x_data = X->data<T>();
+    auto x_dims = X->dims();
+    int offset = context.Attr<int>("offset");
+    auto* out = context.Output<framework::Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_dims = out->dims();
+
+    int64_t i;
+    if (x_dims.size() == 1) {
+      float padding_value = context.Attr<float>("padding_value");
+      math::SetConstant<DeviceContext, T> set_padding_value;
+      auto& dev_ctx = context.template device_context<DeviceContext>();
+      set_padding_value(dev_ctx, out, static_cast<T>(padding_value));
+
+      auto x_length = x_dims[0];
+      const int& x_stride = ComputeStride(0, x_dims);
+
+      auto out_stride_0 = ComputeStride(0, out_dims);
+      auto out_stride_1 = ComputeStride(1, out_dims);
+      out_data +=
+          (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
+
+      for (i = 0; i < x_length; i++) {
+        out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride];
+      }
+    } else {
+      auto out_length = out_dims[0];
+      const int& x_stride_0 = ComputeStride(0, x_dims);
+      const int& x_stride_1 = ComputeStride(1, x_dims);
+
+      auto out_stride_0 = ComputeStride(0, out_dims);
+      x_data += (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
+      for (i = 0; i < out_length; i++) {
+        out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)];
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(
+    diag_v2, ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, int>,
+    ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::DiagV2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4386cc6b8183c03b4d4a19aba7d1126eac2ab495
--- /dev/null
+++ b/paddle/fluid/operators/diag_v2_op.cu
@@ -0,0 +1,122 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/diag_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Extract the diagonal of a matrix 'x' to a vector 'out'.
+template <typename T>
+__global__ void ExtractDiagonalKernel(T* out, const T* x, std::ptrdiff_t start,
+                                      std::ptrdiff_t size,
+                                      const std::ptrdiff_t sumStride,
+                                      const std::ptrdiff_t outStride) {
+  for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
+       idx += gridDim.x * blockDim.x) {
+    const std::ptrdiff_t xOffset = start + sumStride * idx;
+    out[outStride * idx] = x[xOffset];
+  }
+}
+
+// Paste a vector 'x' to the diagonal of a matrix 'out'
+template <typename T>
+__global__ void PasteDiagonalKernel(T* out, const T* x, std::ptrdiff_t start,
+                                    std::ptrdiff_t x_length,
+                                    const std::ptrdiff_t sumStride,
+                                    const std::ptrdiff_t xStride) {
+  for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < x_length; idx += gridDim.x * blockDim.x) {
+    const std::ptrdiff_t outOffset = start + sumStride * idx;
+    out[outOffset] = x[xStride * idx];
+  }
+}
+
+template <typename DeviceContext, typename T>
+class DiagV2CUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* x_data = X->data<T>();
+    auto x_dims = X->dims();
+    int offset = context.Attr<int>("offset");
+    auto* out = context.Output<framework::Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_dims = out->dims();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    if (x_dims.size() == 1) {
+      float padding_value = context.Attr<float>("padding_value");
+      math::SetConstant<DeviceContext, T> set_padding_value;
+      set_padding_value(dev_ctx, out, static_cast<T>(padding_value));
+
+      auto x_length = x_dims[0];
+      auto size = (offset > 0) ? x_length + offset : x_length - offset;
+      const int& x_stride = ComputeStride(0, x_dims);
+      if (size > 0) {
+        const int block_num = std::min(static_cast<int>(size),
+                                       dev_ctx.GetMaxPhysicalThreadCount());
+        int size_ = static_cast<int>(size);
+        int block_num_ = static_cast<int>(block_num);
+        const int grid_num =
+            std::min(1024, (size_ + block_num_ - 1) / block_num_);
+        const auto& out_stride_0 = ComputeStride(0, out_dims);
+        const auto& out_stride_1 = ComputeStride(1, out_dims);
+        auto start =
+            (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
+
+        PasteDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
+            out_data, x_data, start, x_length, out_stride_0 + out_stride_1,
+            x_stride);
+      }
+    } else {
+      const int& x_stride_0 = ComputeStride(0, x_dims);
+      const int& x_stride_1 = ComputeStride(1, x_dims);
+
+      int size;
+      if (offset > 0) {
+        size = std::min(x_dims[0], x_dims[1] - offset);
+      } else {
+        size = std::min(x_dims[0] + offset, x_dims[1]);
+      }
+
+      if (size > 0) {
+        const int block_num = std::min(static_cast<int>(size),
+                                       dev_ctx.GetMaxPhysicalThreadCount());
+        int size_ = static_cast<int>(size);
+        int block_num_ = static_cast<int>(block_num);
+        const int grid_num =
+            std::min(1024, (size_ + block_num_ - 1) / block_num_);
+        auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
+        const auto& out_stride_0 = ComputeStride(0, out_dims);
+
+        ExtractDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
+            out_data, x_data, start, size, x_stride_0 + x_stride_1,
+            out_stride_0);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    diag_v2, ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DiagV2CUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/fluid/operators/diag_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7850def06117ff4232afe4fca95a3e3e500e876d
--- /dev/null
+++ b/paddle/fluid/operators/diag_v2_op.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using DDim = framework::DDim;
+
+static inline int ComputeStride(int axis, DDim dims) {
+  int size = 1;
+  for (int i = axis + 1; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index cff3993a068ceee1947ca3e17b9cc6a75e3c9ba9..a033611f478f9ea44fd49ab2015e78aaea6aacd9 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
 cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
-cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv)
+cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator)
 cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
 if(WITH_GPU)
     cc_test(collective_server_test SRCS collective_server_test.cc 
diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h
index fb7a0691154de768d4b828ee5d7b6a47755225f4..0d7032e286caab93dbd38f35881e9064694a8307 100644
--- a/paddle/fluid/operators/distributed/large_scale_kv.h
+++ b/paddle/fluid/operators/distributed/large_scale_kv.h
@@ -28,6 +28,7 @@
 #include <thread>  // NOLINT
 
 #include <ThreadPool.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -96,7 +97,12 @@ class UniformInitializer : public Initializer {
     dist_ = std::uniform_real_distribution<float>(min_, max_);
   }
 
-  float GetValue() override { return dist_(random_engine_); }
+  float GetValue() override {
+    return framework::Generator::GetInstance()->is_init_py
+               ? dist_(framework::Generator::GetInstance()->GetCPUEngine())
+               : dist_(random_engine_);
+    // return dist_(random_engine_);
+  }
 
  private:
   float min_;
@@ -141,7 +147,12 @@ class GaussianInitializer : public Initializer {
     dist_ = std::normal_distribution<float>(mean_, std_);
   }
 
-  float GetValue() override { return dist_(random_engine_); }
+  float GetValue() override {
+    return framework::Generator::GetInstance()->is_init_py
+               ? dist_(framework::Generator::GetInstance()->GetCPUEngine())
+               : dist_(random_engine_);
+    // return dist_(random_engine_);
+  }
 
  private:
   float std_;
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 5a67b358ddabb12566cd4ffe00cb12c65a185099..a9378d61c3ca39bd43b558633cc4d04c40175cac 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -110,7 +110,7 @@ void prefetch_core(
   int pservers = context.Attr<int>("pserver_num");
 
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &actual_ctx = *pool.Get(context.GetPlace());
+  auto &actual_ctx = *pool.Get(platform::CPUPlace());
 
   std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
 
@@ -144,7 +144,6 @@ void prefetch_core(
       VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
     }
   }
-
   for (size_t i = 0; i < rets.size(); i++) {
     PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout(
                                                "internal error in RPCClient"));
@@ -167,6 +166,7 @@ void prefetch_core(
       for (int64_t i = 0; i < dims[0]; ++i) {
         auto origin_id = ids_in_this_section[i];
         std::vector<float> vecs(row_numel);
+
         std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin());
         (*recved_vec_map)[origin_id] = vecs;
       }
@@ -213,18 +213,18 @@ void prefetchs(const std::vector<std::string> &id_var_names,
   const auto place =
       scope.FindVar(id_var_names[0])->Get<framework::LoDTensor>().place();
 
-  if (!platform::is_cpu_place(place)) {
-    PADDLE_THROW("multi prefetch only support CPU currently");
-  }
-
+  std::vector<std::vector<int64_t>> ids_group;
   std::vector<int64_t> ids_union;
+  std::vector<framework::LoD> ids_lods;
   TableAndEndpoints tables;
 
   for (auto &id_name : id_var_names) {
-    auto *in_var = scope.FindVar(id_name);
-    auto &id_tensor = in_var->Get<framework::LoDTensor>();
-    std::copy_n(id_tensor.data<int64_t>(), id_tensor.numel(),
-                back_inserter(ids_union));
+    auto &id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
+    std::vector<int64_t> ids;
+    TensorToVector(id_tensor, context.device_context(), &ids);
+    ids_union.insert(ids_union.end(), ids.begin(), ids.end());
+    ids_group.push_back(ids);
+    ids_lods.push_back(id_tensor.lod());
   }
 
   std::unordered_set<int64_t> s(ids_union.begin(), ids_union.end());
@@ -258,25 +258,48 @@ void prefetchs(const std::vector<std::string> &id_var_names,
   }
 
   for (size_t i = 0; i < out_var_names.size(); i++) {
-    auto *in_var = scope.FindVar(id_var_names[i]);
-    auto &id_tensor = in_var->Get<framework::LoDTensor>();
-    auto ids_size = id_tensor.dims()[0];
-    const auto *id_data = id_tensor.data<int64_t>();
-
+    std::vector<int64_t> ids = ids_group[i];
+    auto ids_size = ids.size();
     auto *out_t =
         scope.FindVar(out_var_names[i])->GetMutable<framework::LoDTensor>();
-    out_t->set_lod(id_tensor.lod());
-    out_t->Resize(framework::make_ddim({ids_size, vec_dim_1}));
+    out_t->set_lod(ids_lods[i]);
+    out_t->Resize(
+        framework::make_ddim({static_cast<int64_t>(ids_size), vec_dim_1}));
     auto *out_d = out_t->mutable_data<float>(place);
 
-    for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
-      const auto &id = id_data[idx];
-      if (padding_idx != distributed::kNoPadding && id == padding_idx) {
-        memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1);
-      } else {
-        std::copy_n(recved_vec_map[id].begin(), vec_dim_1,
-                    out_d + idx * vec_dim_1);
+    if (platform::is_cpu_place(out_t->place())) {
+      for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
+        const auto &id = ids[idx];
+        if (padding_idx != distributed::kNoPadding && id == padding_idx) {
+          memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1);
+        } else {
+          std::copy_n(recved_vec_map[id].begin(), vec_dim_1,
+                      out_d + idx * vec_dim_1);
+        }
+      }
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
+        const auto &id = ids[idx];
+        auto stream = context.cuda_device_context().stream();
+        if (padding_idx != distributed::kNoPadding && id == padding_idx) {
+          platform::GpuMemsetAsync(out_d + idx * vec_dim_1, 0,
+                                   sizeof(float) * vec_dim_1, stream);
+        } else {
+          auto &cpu_place =
+              BOOST_GET_CONST(platform::CPUPlace,
+                              paddle::platform::CPUDeviceContext().GetPlace());
+          auto &gpu_place =
+              BOOST_GET_CONST(platform::CUDAPlace, out_t->place());
+          memory::Copy(gpu_place, out_d + idx * vec_dim_1, cpu_place,
+                       &recved_vec_map[id][0], sizeof(float) * vec_dim_1,
+                       stream);
+        }
       }
+#else
+      PADDLE_ENFORCE(true, platform::errors::PermissionDenied(
+                               "Paddle is not compiled with GPU!"));
+#endif
     }
   }
 }
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
index 3037a63b0d7b4e8812e67fdfb776f89ea43eb546..8c093d12585981ee681ae13f0d2e493197c6b9b3 100644
--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#include "paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -75,47 +73,6 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename T>
-class DistributedLookupTableKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto ids_vars = context.MultiInputVar("Ids");
-    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
-
-    auto id_names = context.InputNames("Ids");
-    auto embedding_name = context.InputNames("W").front();
-    auto out_names = context.OutputNames("Outputs");
-    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
-    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
-    auto is_distributed = context.Attr<bool>("is_distributed");
-
-    auto lookup_table_version =
-        context.Attr<std::string>("lookup_table_version");
-
-    operators::distributed::prefetchs(id_names, out_names, embedding_name,
-                                      is_distributed, lookup_tables, endpoints,
-                                      context, context.scope());
-
-    if (lookup_table_version == "lookup_table_v2") {
-      auto &scope = context.scope();
-      auto emb_dim =
-          scope.FindVar(embedding_name)->Get<framework::LoDTensor>().dims()[1];
-
-      for (size_t i = 0; i < id_names.size(); ++i) {
-        auto *id_var = scope.FindVar(id_names[i]);
-        auto *out_var = scope.FindVar(out_names[i]);
-        auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
-        auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
-
-        auto id_dims = id_tensor->dims();
-        out_tensor->Resize(framework::make_ddim(
-            {static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
-             static_cast<int64_t>(emb_dim)}));
-      }
-    }
-  }
-};
-
 class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -170,15 +127,12 @@ class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddComment(R"DOC(
 Lookup Tablel Prefetch Operator.
-
 This operator is used to perform lookup on parameter W,
 then concatenated into a sparse tensor.
-
 The type of Ids(Input) is SelectedRows, the rows of Ids contains
 the ids to be looked up in W;
 if the Id is not in the sparse table, this operator will return a
 random value and set the value into the table for the next looking up.
-
 )DOC");
   }
 };
@@ -191,4 +145,5 @@ REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
                   ops::DistributedLookupTableOpMaker);
 
 REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
-                       ops::DistributedLookupTableKernel<float>);
+                       ops::DistributedLookupTableKernel<
+                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54c894415096e869f363eda6a1de2a473e839263
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    distributed_lookup_table,
+    ops::DistributedLookupTableKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a71451c78a870b71c05b41bdcfb34a85b3e2213b
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DistributedLookupTableKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto ids_vars = context.MultiInputVar("Ids");
+    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
+
+    auto id_names = context.InputNames("Ids");
+    auto embedding_name = context.InputNames("W").front();
+    auto out_names = context.OutputNames("Outputs");
+    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
+    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
+    auto is_distributed = context.Attr<bool>("is_distributed");
+
+    operators::distributed::prefetchs(id_names, out_names, embedding_name,
+                                      is_distributed, lookup_tables, endpoints,
+                                      context, context.scope());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc
index ccc30d1ea082a6f69b71059631247144c931116e..d194fcda36a474fa208f5d5a67e425ba5a5a3303 100644
--- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc
@@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
-        ctx.GetPlace());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 2580b00d7c2bdfae726b924bb51de199586e12c3..cec706300d77b2b0e66e5b682dfad8536b5dc401 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -26,6 +26,86 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename DeviceContext, typename T>
+void DotGradFunction(const Tensor* tensor_x, const Tensor* tensor_y,
+                     const Tensor* tensor_dout, Tensor* tensor_dx,
+                     Tensor* tensor_dy,
+                     const paddle::framework::ExecutionContext& ctx) {
+#ifdef __NVCC__
+  if (1 == tensor_dout->dims().size()) {
+    auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);
+
+    if (tensor_dx) {
+      auto y = framework::EigenVector<T>::Flatten(*tensor_y);
+      auto dx = framework::EigenVector<T>::Flatten(*tensor_dx);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 1> size(tensor_dx->numel());
+      dx.device(dev) = y * dout.broadcast(size);
+    }
+
+    if (tensor_dy) {
+      auto x = framework::EigenVector<T>::Flatten(*tensor_x);
+      auto dy = framework::EigenVector<T>::Flatten(*tensor_dy);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 1> size(tensor_dy->numel());
+      dy.device(dev) = x * dout.broadcast(size);
+    }
+  } else {
+    auto dout = EigenMatrix<T>::From(*tensor_dout);
+
+    if (tensor_dx) {
+      tensor_dx->mutable_data<T>(ctx.GetPlace());
+      auto y = EigenMatrix<T>::From(*tensor_y);
+      auto dx = EigenMatrix<T>::From(*tensor_dx);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
+      dx.device(dev) = y * dout.broadcast(size);
+    }
+
+    if (tensor_dy) {
+      tensor_dy->mutable_data<T>(ctx.GetPlace());
+      auto x = EigenMatrix<T>::From(*tensor_x);
+      auto dy = EigenMatrix<T>::From(*tensor_dy);
+      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+      Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
+      dy.device(dev) = x * dout.broadcast(size);
+    }
+  }
+#else
+  const auto* data_dout = tensor_dout->data<T>();
+
+  if (tensor_dx) {
+    auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
+    const auto* data_y = tensor_y->data<T>();
+    const framework::DDim& dim = tensor_x->dims();
+    size_t N = static_cast<size_t>(framework::product(dim));
+
+    auto step = dim[dim.size() - 1];
+
+    int s = -1;
+    for (size_t i = 0; i < N; ++i) {
+      if (0 == i % step) ++s;
+      data_dx[i] = data_y[i] * data_dout[s];
+    }
+  }
+
+  if (tensor_dy) {
+    auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
+    const auto* data_x = tensor_x->data<T>();
+    const framework::DDim& dim = tensor_y->dims();
+    size_t N = static_cast<size_t>(framework::product(dim));
+
+    auto step = dim[dim.size() - 1];
+
+    int s = -1;
+    for (size_t i = 0; i < N; ++i) {
+      if (0 == i % step) ++s;
+      data_dy[i] = data_x[i] * data_dout[s];
+    }
+  }
+#endif
+}
+
 template <typename DeviceContext, typename T>
 class DotKernel : public framework::OpKernel<T> {
  public:
@@ -84,83 +164,9 @@ class DotGradKernel : public framework::OpKernel<T> {
 
     if (tensor_dx) tensor_dx->mutable_data<T>(ctx.GetPlace());
     if (tensor_dy) tensor_dy->mutable_data<T>(ctx.GetPlace());
-#ifdef __NVCC__
-    if (1 == tensor_dout->dims().size()) {
-      auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);
-
-      if (tensor_dx) {
-        auto y = framework::EigenVector<T>::Flatten(*tensor_y);
-        auto dx = framework::EigenVector<T>::Flatten(*tensor_dx);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 1> size(tensor_dx->numel());
-        dx.device(dev) = y * dout.broadcast(size);
-      }
-
-      if (tensor_dy) {
-        auto x = framework::EigenVector<T>::Flatten(*tensor_x);
-        auto dy = framework::EigenVector<T>::Flatten(*tensor_dy);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 1> size(tensor_dy->numel());
-        dy.device(dev) = x * dout.broadcast(size);
-      }
-    } else {
-      auto dout = EigenMatrix<T>::From(*tensor_dout);
-
-      if (tensor_dx) {
-        tensor_dx->mutable_data<T>(ctx.GetPlace());
-        auto y = EigenMatrix<T>::From(*tensor_y);
-        auto dx = EigenMatrix<T>::From(*tensor_dx);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
-        dx.device(dev) = y * dout.broadcast(size);
-      }
-
-      if (tensor_dy) {
-        tensor_dy->mutable_data<T>(ctx.GetPlace());
-        auto x = EigenMatrix<T>::From(*tensor_x);
-        auto dy = EigenMatrix<T>::From(*tensor_dy);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
-        dy.device(dev) = x * dout.broadcast(size);
-      }
-    }
-#else
-    const auto* data_dout = tensor_dout->data<T>();
-
-    if (tensor_dx) {
-      auto* data_dx = tensor_dx->mutable_data<T>(ctx.GetPlace());
-      const auto* data_y = tensor_y->data<T>();
-      const framework::DDim& dim = tensor_x->dims();
-      size_t N = static_cast<size_t>(framework::product(dim));
-
-      auto step = dim[dim.size() - 1];
 
-      int s = -1;
-      for (size_t i = 0; i < N; ++i) {
-        if (0 == i % step) ++s;
-        data_dx[i] = data_y[i] * data_dout[s];
-      }
-    }
-
-    if (tensor_dy) {
-      auto* data_dy = tensor_dy->mutable_data<T>(ctx.GetPlace());
-      const auto* data_x = tensor_x->data<T>();
-      const framework::DDim& dim = tensor_y->dims();
-      size_t N = static_cast<size_t>(framework::product(dim));
-
-      auto step = dim[dim.size() - 1];
-
-      int s = -1;
-      for (size_t i = 0; i < N; ++i) {
-        if (0 == i % step) ++s;
-        data_dy[i] = data_x[i] * data_dout[s];
-      }
-    }
-#endif
+    DotGradFunction<DeviceContext, T>(tensor_x, tensor_y, tensor_dout,
+                                      tensor_dx, tensor_dy, ctx);
   }
 };
 
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 676361289e888a5bbd71b63ff16a1ddd7e5dad51..bce4c7ca19a603fd2eadaff7f82b5cdec91bb79f 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -55,6 +56,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
         return;
       }
 
+      bool init_generator_py = framework::Generator::GetInstance()->is_init_py;
+
       // NOTE: fixed seed should only be used in unittest or for debug.
       // Guarantee to use random seed in training.
       std::random_device rnd;
@@ -71,7 +74,11 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       std::uniform_real_distribution<float> dist(0, 1);
 
       for (size_t i = 0; i < size; ++i) {
-        if (dist(engine) < dropout_prob) {
+        float cur_random =
+            init_generator_py
+                ? dist(framework::Generator::GetInstance()->GetCPUEngine())
+                : dist(engine);
+        if (cur_random < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
index 5a398fa50febe2efffd588ce8f3612f1f9cec0b6..457d9e79d7da171ef526d5cab0e59b021cb64f98 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -49,6 +49,8 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
     ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
     ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
                                    int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
index 60846d1e8fee1c7f68ac101f18355750c2c15a4d..f63d6f037632c1a6a05726b933b2258adc113ee3 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
@@ -19,5 +19,7 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
     ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
index 2d24e394d5c823dbd22c837210e46cefeceba1be..a5909aad99a82529a0739cd28b1b72a146524f76 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <math.h>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
@@ -24,7 +25,16 @@ namespace operators {
 
 template <typename T>
 struct FloorDivFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return static_cast<T>(floor(a / b));
+  }
+};
+
+template <typename T>
+struct InverseFloorDivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return static_cast<T>(floor(b / a));
+  }
 };
 
 template <typename DeviceContext, typename T>
@@ -32,8 +42,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
                            const framework::Tensor *x,
                            const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
-      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
+  }
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
index af80666b9542db1073f3b07618433671652fffa4..8c2e62bed195f27e228d5dd460ba21ed87c3f5d2 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
@@ -25,14 +25,14 @@ class ElementwiseModOpMaker : public ElementwiseOpMaker {
 
   void AddInputX() override {
     AddInput("X",
-             "(Variable), Tensor or LoDTensor of any dimensions. Its dtype "
-             "should be int32, int64.");
+             "(Tensor), Tensor of any dimensions. Its dtype "
+             "should be int32, int64, float32 or float64.");
   }
 
   void AddInputY() override {
     AddInput("Y",
-             "(Variable), Tensor or LoDTensor of any dimensions. Its dtype "
-             "should be int32, int64.");
+             "(Tensor), Tensor of any dimensions. Its dtype "
+             "should be int32, int64, float32 or float64.");
   }
 
   std::string GetOpFuntionality() const override {
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index f32086c94a99d1267939a421b1c73a858f877556..ece6af1b5a6f562bd7ff81290f98e8636feabb0c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -19,9 +19,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -82,7 +84,13 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       auto y_dims = ctx->GetInputDim("Y");
       int max_dim = std::max(x_dims.size(), y_dims.size());
       int axis = ctx->Attrs().Get<int>("axis");
-      axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+      PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), true,
+                        platform::errors::InvalidArgument(
+                            "The axis range must be [%s, %s), but axis is %s. "
+                            "Please set the axis again.",
+                            -1 * max_dim, max_dim, axis));
+      axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
+                       : axis);
       std::vector<int> x_dims_array(max_dim);
       std::vector<int> y_dims_array(max_dim);
       std::vector<int> out_dims_array(max_dim);
@@ -132,20 +140,24 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
                  "Y.dimension must be a subsequence of x.dimension. And axis "
                  "is the start dimension index "
                  "for broadcasting Y onto X. ")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
+        .SetDefault(-1);
     AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
         .SetDefault(false);
     AddAttr<std::string>("x_data_format", "This parameter is no longer used.")
         .SetDefault("");
     AddAttr<std::string>("y_data_format", "This parameter is no longer used.")
         .SetDefault("");
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
     AddAttr<float>("Scale_x",
                    "(float, default 1.0f), The quantize scale of X tensor")
         .SetDefault(1.0f);
diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..495b640bb4399736456bf391c6522686b9763951
--- /dev/null
+++ b/paddle/fluid/operators/expand_as_v2_op.cc
@@ -0,0 +1,150 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/expand_as_v2_op.h"
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandAsV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2");
+    OP_INOUT_CHECK(ctx->HasInput("target_tensor"), "Input", "target_tensor",
+                   "ExpandAsV2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2");
+    auto x_dims = ctx->GetInputDim("X");
+    auto target_tensor_dims = ctx->GetInputDim("target_tensor");
+    PADDLE_ENFORCE_GE(
+        target_tensor_dims.size(), static_cast<size_t>(x_dims.size()),
+        platform::errors::InvalidArgument(
+            "The rank of Input(target_tensor) must be greater than or equal "
+            "to the rank of Input(X). But received Input(X): input "
+            "rank %u, input shape [%s]; received Input(target_tensor): "
+            "input rank %u, input shape [%s].",
+            x_dims.size(), x_dims, target_tensor_dims.size(),
+            target_tensor_dims));
+    PADDLE_ENFORCE_LE(
+        target_tensor_dims.size(), MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of Input(target_tensor) must not be less than or equal "
+            "to %d. But received: input rank %u, input shape [%s].",
+            MAX_RANK_SUPPORTED, x_dims.size(), x_dims));
+    std::vector<int64_t> out_shape(target_tensor_dims.size());
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+  }
+};
+
+class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
+    AddInput("target_tensor", "Expand tensor's shape for each dimension.");
+    AddComment(R"DOC(
+Expand the input by given times number. You should set times
+number for each dimension by providing tensor 'expend_tensor'. The rank of X
+should be in [1, 6]. Please note that size of 'expend_tensor' must be the same
+with X's rank. Following is a using case:
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+target_tensors'shape:  [2, 6, 2]
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+)DOC");
+  }
+};
+
+class ExpandAsV2GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2Grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "ExpandAsV2Grad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class ExpandAsV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("expand_as_v2_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("target_tensor", this->Input("target_tensor"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker,
+                  ops::ExpandAsV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp,
+                  ops::ExpandAsV2GradNoNeedBufVarsInferer);
+REGISTER_OP_CPU_KERNEL(
+    expand_as_v2,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ExpandAsV2Kernel<paddle::platform::CPUDeviceContext, bool>);
+REGISTER_OP_CPU_KERNEL(
+    expand_as_v2_grad,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_as_v2_op.cu b/paddle/fluid/operators/expand_as_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e315144472dd9fd4095043e4800a3f276d9314c7
--- /dev/null
+++ b/paddle/fluid/operators/expand_as_v2_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/expand_as_v2_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    expand_as_v2,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ExpandAsV2Kernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    expand_as_v2_grad,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandAsV2GradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4c30dfe1298d1736407c33f40d72dc690046cba
--- /dev/null
+++ b/paddle/fluid/operators/expand_as_v2_op.h
@@ -0,0 +1,214 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_AS_TEMPLATE(z, n, data) \
+  case n + 1: {                        \
+    ExpandAs<n + 1>(context);          \
+    break;                             \
+  }
+#define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~)
+#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_AS_GRAD_CASE(n)                                       \
+  case n: {                                                          \
+    ExpandAsBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                           \
+  }
+#define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), )
+#define REP_EXPAND_AS_GRAD_TEMPLATE(n) \
+  BOOST_PP_REPEAT(n, EXPAND_AS_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class ExpandAsV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto target_rank = target_tensor->dims().size();
+    PADDLE_ENFORCE_GE(target_rank, rank,
+                      platform::errors::InvalidArgument(
+                          "The rank (%d) of the input 'target_tensor' for "
+                          "expand_as_v2 op must be greater than or equal to "
+                          "the rank (%d) of the input 'x'.",
+                          target_rank, rank));
+    PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument(
+                                   "The rank (%d) of the input 'x' for "
+                                   "expand_as_v2 op must be positive.",
+                                   rank));
+    PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
+                      platform::errors::InvalidArgument(
+                          "The rank (%d) of the input 'target_tensor' for "
+                          "expand_as_v2 op must be less than or equal to %d.",
+                          target_rank, MAX_RANK_SUPPORTED));
+
+    switch (target_rank) { REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) }
+  }
+
+ protected:
+  template <int Rank>
+  void ExpandAs(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto in_dims = in0->dims();
+    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    auto target_shape = framework::vectorize<int>(target_tensor->dims());
+    auto diff = target_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      PADDLE_ENFORCE_NE(target_shape[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The value of target shape cannot be zero."));
+      if (vec_in_dims[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            vec_in_dims[i], target_shape[i],
+            platform::errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in "
+                "target tensor for expand_as_v2 op.",
+                vec_in_dims[i], target_shape[i]));
+        repeat_times[i] = 1;
+      } else {
+        repeat_times[i] = target_shape[i];
+      }
+    }
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      bcast_dims[i] = repeat_times[i];
+    }
+
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims = framework::make_ddim(target_shape);
+
+    out0->Resize(out_dims);
+    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ExpandAsV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto x_dims = in0->dims();
+    auto target_shape = target_tensor->dims();
+    auto vec_in_dims = framework::vectorize<int>(x_dims);
+    auto diff = target_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      repeat_times[i] = target_shape[i] / vec_in_dims[i];
+    }
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(repeat_times[i]);
+      reshape_dims_vec.push_back(vec_in_dims[i]);
+    }
+
+    int dims = reduce_dims_vec.size();
+    bool just_copy = true;
+    for (size_t i = 0; i < repeat_times.size(); i++) {
+      if (repeat_times[i] != 1) {
+        just_copy = false;
+        break;
+      }
+    }
+    // no need reduce, just copy
+    if (just_copy) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
+                            out0);
+    } else {
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_as_v2_grad op must be greater than or "
+                            "equal to 1, but the value received is %d.",
+                            dims));
+      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_as_v2_grad op must be less than or equal "
+                            "to %d, but the value received is %d.",
+                            MAX_RANK_SUPPORTED, dims));
+      switch (dims) { REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandAsBackward(const framework::ExecutionContext& context,
+                        const std::vector<int>& reshape_dims_vec,
+                        const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = reshape_dims_vec.size();
+    size_t reduce_size = reduce_dims_vec.size();
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims)
+            .sum(reduce_dims)
+            .reshape(x_grad.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..359d512c341529579a56dbe840e5eef0aa3062a5
--- /dev/null
+++ b/paddle/fluid/operators/expand_v2_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/expand_v2_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandV2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandV2");
+    auto x_dims = ctx->GetInputDim("X");
+    auto expand_shape = ctx->Attrs().Get<std::vector<int>>("shape");
+
+    if (expand_shape.size() == 0) {
+      expand_shape = std::vector<int>(x_dims.size(), -1);
+    }
+
+    PADDLE_ENFORCE_GE(
+        expand_shape.size(), static_cast<size_t>(x_dims.size()),
+        platform::errors::InvalidArgument(
+            "The number of elements (%d) of 'shape' for "
+            "expand_v2 op must be greater than or equal to the rank "
+            "(%d) of the input.",
+            expand_shape.size(), static_cast<size_t>(x_dims.size())));
+    PADDLE_ENFORCE_LE(expand_shape.size(), MAX_RANK_SUPPORTED,
+                      platform::errors::InvalidArgument(
+                          "The number of elements (%d) of 'shape' for "
+                          "must not be greater than %d.",
+                          expand_shape.size(), MAX_RANK_SUPPORTED));
+    PADDLE_ENFORCE_GE(expand_shape.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The number of elements (%d) of 'shape' for "
+                          "must be a positive integer.",
+                          expand_shape.size()));
+
+    auto out_rank =
+        std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
+    std::vector<int64_t> out_shape(out_rank);
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    auto diff = expand_shape.size() - x_dim_vec.size();
+    x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+    for (size_t i = 0; i < expand_shape.size(); ++i) {
+      if (x_dims[i] == -1) {
+        out_shape[i] = -1;
+      } else if (expand_shape[i] == -1) {
+        out_shape[i] = x_dims[i];
+      } else {
+        PADDLE_ENFORCE_GT(
+            expand_shape[i], 0,
+            platform::errors::InvalidArgument(
+                "The %uth element of 'shape' for expand_v2 op must be "
+                "greater than 0, but the value given is %d.",
+                i, expand_shape[i]));
+        out_shape[i] = expand_shape[i];
+      }
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "expand_shapes_tensor" || var_name == "Shape") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class ExpandV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
+    AddInput("Shape",
+             "(Tensor<int>), optional). If provided, expand according to "
+             "this given Shape. It has a higher priority than "
+             "expand_shapes_tensor and the shape attribute.")
+        .AsDispensable();
+    AddInput("expand_shapes_tensor",
+             "(Tensor Tensor<int>), epxanded shape for X."
+             "It has a higher priority than shape attribute, but a lower "
+             "priority than the input Shape")
+        .AsDuplicable()
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
+    AddAttr<std::vector<int>>("shape", "The expanded shape for each dimension.")
+        .SetDefault({});
+    AddComment(R"DOC(
+Expand the input to the given shape. The rank of X
+should be in [1, 6] and size of 'shape' must be in [1, 6] also.
+Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(shape):  [2, 6, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class ExpandV2GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandV2Grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "ExpandV2Grad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    if (expand_shape.size() == 0) {
+      expand_shape = std::vector<int>(x_dims.size(), -1);
+    }
+
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    auto diff = expand_shape.size() - x_dim_vec.size();
+    x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+
+    for (size_t i = 0; i < expand_shape.size(); ++i) {
+      if (expand_shape[i] == -1 || x_dim_vec[i] == -1) {
+        continue;
+      } else {
+        if (ctx->IsRuntime()) {
+          PADDLE_ENFORCE_EQ(
+              expand_shape[i], out_dims[i],
+              platform::errors::InvalidArgument(
+                  "The size (%d) of the dimension %d of Input(Out@GRAD) should "
+                  "be equal to the crroresponding dimension size of shape(%d).",
+                  out_dims[i], i, expand_shape[i]));
+        }
+      }
+    }
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "expand_shapes_tensor" || var_name == "Shape") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template <typename T>
+class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("expand_v2_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
+    op->SetInput("Shape", this->Input("Shape"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
+                  ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
+                  ops::ExpandV2GradNoNeedBufVarsInferer);
+REGISTER_OP_CPU_KERNEL(
+    expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, bool>);
+REGISTER_OP_CPU_KERNEL(
+    expand_v2_grad,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/expand_v2_op.cu b/paddle/fluid/operators/expand_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e096dbc27f0c2ae8142da40b9db99074b2719387
--- /dev/null
+++ b/paddle/fluid/operators/expand_v2_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/expand_v2_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    expand_v2, ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    expand_v2_grad,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec9c6e62f272ed87abc4e0be6ccf1de3aedf15d4
--- /dev/null
+++ b/paddle/fluid/operators/expand_v2_op.h
@@ -0,0 +1,296 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+inline std::vector<int> get_expand_shape(
+    const framework::ExecutionContext& ctx) {
+  if (ctx.HasInput("Shape")) {
+    auto* shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
+    auto* shape_data = shape_tensor->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    if (platform::is_gpu_place(shape_tensor->place())) {
+      TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
+      shape_data = cpu_shape_tensor.data<int>();
+    }
+    auto vec_shape =
+        std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+    return vec_shape;
+  }
+
+  auto list_expand_shapes_tensor =
+      ctx.MultiInput<framework::Tensor>("expand_shapes_tensor");
+  if (list_expand_shapes_tensor.size() > 0) {
+    // get tensor from
+    std::vector<int> vec_epxand_shape;
+    for (size_t i = 0; i < list_expand_shapes_tensor.size(); ++i) {
+      auto tensor = list_expand_shapes_tensor[i];
+      if (platform::is_gpu_place(tensor->place())) {
+        framework::Tensor temp;
+        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_epxand_shape.push_back(*temp.data<int32_t>());
+      } else {
+        vec_epxand_shape.push_back(*tensor->data<int32_t>());
+      }
+    }
+    return vec_epxand_shape;
+  } else {
+    return ctx.Attr<std::vector<int>>("shape");
+  }
+}
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::To32BitIndex;
+
+template <typename DeviceContext, typename T>
+class ExpandV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    PADDLE_ENFORCE_GE(
+        rank, 1,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'X' for expand_v2 op must be positive, "
+            "but the value received is %d.",
+            rank));
+    PADDLE_ENFORCE_LE(
+        rank, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'X' for expand_v2 op must be less than "
+            "or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, rank));
+    auto expand_shape = get_expand_shape(context);
+    auto shape_size = expand_shape.size();
+    PADDLE_ENFORCE_GE(
+        shape_size, rank,
+        platform::errors::InvalidArgument(
+            "The number (%d) of elements of 'shape' for expand_v2 op must be "
+            "greater than or equal to the rank (%d) of the input 'X'.",
+            shape_size, rank));
+    PADDLE_ENFORCE_LE(
+        shape_size, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The number (%d) of elements of 'shape' for expand_v2 op must be "
+            "less than or equal to %d.",
+            shape_size, MAX_RANK_SUPPORTED));
+    rank = std::max(rank, static_cast<int>(shape_size));
+    switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+
+    auto in_dims = in0->dims();
+    auto expand_shape = get_expand_shape(context);
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    auto diff = expand_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      PADDLE_ENFORCE_NE(expand_shape[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The expanded size cannot be zero."));
+      if (i < diff) {
+        PADDLE_ENFORCE_GT(
+            expand_shape[i], 0,
+            platform::errors::InvalidArgument(
+                "The expanded size (%d) for non-existing dimensions must be "
+                "positive for expand_v2 op.",
+                expand_shape[i]));
+        repeat_times[i] = expand_shape[i];
+      } else if (expand_shape[i] > 0) {
+        if (vec_in_dims[i] != 1) {
+          PADDLE_ENFORCE_EQ(
+              vec_in_dims[i], expand_shape[i],
+              platform::errors::InvalidArgument(
+                  "The value (%d) of the non-singleton dimension does not match"
+                  " the corresponding value (%d) in shape for expand_v2 op.",
+                  vec_in_dims[i], expand_shape[i]));
+          repeat_times[i] = 1;
+        } else {
+          repeat_times[i] = expand_shape[i];
+        }
+      } else {
+        PADDLE_ENFORCE_EQ(
+            expand_shape[i], -1,
+            platform::errors::InvalidArgument(
+                "When the value in shape is negative for expand_v2 op, "
+                "only -1 is supported, but the value received is %d.",
+                expand_shape[i]));
+        repeat_times[i] = 1;
+      }
+    }
+
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      bcast_dims[i] = repeat_times[i];
+    }
+
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims(new_in_dims);
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      out_dims[i] *= repeat_times[i];
+    }
+
+    out0->Resize(out_dims);
+    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    // use 32-bit index to speed up
+    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
+    if (use_32bit_index) {
+      To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims);
+    } else {
+      y.device(place) = x.broadcast(bcast_dims);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ExpandV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto expand_shape = get_expand_shape(context);
+    auto x_dims = in0->dims();
+    auto vec_in_dims = framework::vectorize<int>(x_dims);
+    auto diff = expand_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    // 1. reshape_dims_vec is the broadcast parameter.
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> repeat_times(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      if (expand_shape[i] < 0) {
+        repeat_times[i] = 1;
+      } else {
+        repeat_times[i] = expand_shape[i] / vec_in_dims[i];
+      }
+    }
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(repeat_times[i]);
+      reshape_dims_vec.push_back(vec_in_dims[i]);
+    }
+
+    int dims = reduce_dims_vec.size();
+
+    bool just_copy = true;
+    for (size_t i = 0; i < repeat_times.size(); i++) {
+      if (repeat_times[i] != 1) {
+        just_copy = false;
+        break;
+      }
+    }
+    // no need reduce, just copy
+    if (just_copy) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
+                            out0);
+    } else {
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_v2_grad op must be greater than or "
+                            "equal to 1, but the value received is %d.",
+                            dims));
+      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for "
+                            "expand_v2_grad op must be less than or equal "
+                            "to %d, but the value received is %d.",
+                            MAX_RANK_SUPPORTED, dims));
+      switch (dims) { REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = reshape_dims_vec.size();
+    size_t reduce_size = reduce_dims_vec.size();
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims)
+            .sum(reduce_dims)
+            .reshape(x_grad.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc
index 2cf08e5c3409acb6b8a43058e143df55fd563d74..793519b40182114c13e63dd32caaa382d55fa52d 100644
--- a/paddle/fluid/operators/eye_op.cc
+++ b/paddle/fluid/operators/eye_op.cc
@@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns].
 
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
-using float16 = paddle::platform::float16;
 
 REGISTER_OPERATOR(
     eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
@@ -93,4 +92,4 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
                        ops::EyeKernel<CPU, double>,
                        ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
-                       ops::EyeKernel<CPU, float16>);
+                       ops::EyeKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 0d2b951ee1c544151e99af8216db7809e2a77852..9b0328b0945ba9b57cb9ab27233656e3b0af4f5f 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -37,20 +37,49 @@ template <typename T>
 struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& dev_ctx,
                   const framework::Tensor* in, const framework::Tensor** scales,
-                  const int scale_num, T max_range, framework::Tensor* out) {
+                  const int scale_num, T max_range, const int quant_axis,
+                  framework::Tensor* out) {
     if (scale_num == 1) {
-      const int channel = in->dims()[0];
+      // Dequant op is before quantized op
+      // Dequantize the weight of quantized op
+      auto in_dims = in->dims();
+      const int64_t channel = in_dims[quant_axis];
       const T* scale_factor = scales[0]->data<T>();
-      for (int i = 0; i < channel; i++) {
-        T s = scale_factor[i];
-        framework::Tensor one_channel_in = in->Slice(i, i + 1);
-        framework::Tensor one_channel_out = out->Slice(i, i + 1);
-        auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
-        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-        auto& dev = *dev_ctx.eigen_device();
-        out_e.device(dev) = in_e * s / max_range;
+      if (quant_axis == 0) {
+        for (int64_t i = 0; i < channel; i++) {
+          T s = scale_factor[i];
+          framework::Tensor one_channel_in = in->Slice(i, i + 1);
+          framework::Tensor one_channel_out = out->Slice(i, i + 1);
+          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+          auto& dev = *dev_ctx.eigen_device();
+          out_e.device(dev) = in_e * s / max_range;
+        }
+      } else if (quant_axis == 1) {
+        int64_t out_iter = 1;
+        for (int i = 0; i < quant_axis; i++) {
+          out_iter *= in_dims[i];
+        }
+        int64_t step_i = in->numel() / out_iter;
+        int64_t step_j = in->numel() / (out_iter * channel);
+        auto* in_data = in->data<T>();
+        auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+        for (int64_t i = 0; i < out_iter; i++) {
+          for (int64_t j = 0; j < channel; j++) {
+            auto* cur_in = in_data + i * step_i + j * step_j;
+            auto* cur_out = out_data + i * step_i + j * step_j;
+            T s = scale_factor[j];
+            for (int64_t k = 0; k < step_j; k++) {
+              *cur_out = (*cur_in) * s / max_range;
+              ++cur_in;
+              ++cur_out;
+            }
+          }
+        }
       }
     } else if (scale_num == 2) {
+      // Dequant op is after quantized op
+      // Dequantize the output tensor of quantized op
       int batch_size = in->dims()[0];
       int channel = in->dims()[1];
       const T* scale_one = scales[0]->data<T>();
@@ -157,6 +186,18 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker
         "Quantization bit numbers in quantization stage. "
         "The size of `quant_bits` should be equal to the size of `Scales`.")
         .SetDefault({8});
+    AddAttr<int>("quant_axis",
+                 "(int, default 0) The axis for quantization. "
+                 "For conv2d, depthwise_conv2d, conv2d_transpose "
+                 "and mul, the quant_axis is equal to the cout axis.")
+        .SetDefault(0)
+        .AddCustomChecker([](const int& quant_axis) {
+          PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
+                            platform::errors::InvalidArgument(
+                                "'quant_axis' should be 0 or 1, but "
+                                "the received is %d",
+                                quant_axis));
+        });
 
     AddComment(R"DOC(
 FakeChannelWiseDequantizeMaxAbsOp operator.
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index 02f9dc827d68cbb58447ed1557ff4bf310b2c017..54a92b055a39d49ea061250b066957f933fb975e 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -45,8 +45,9 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
 };
 
 template <typename T>
-__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
-                                   int num, int channel, T* out) {
+__global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
+                                             T max_range, int num, int channel,
+                                             T* out) {
   int tid = threadIdx.x;
   int channel_size = num / channel;
   const T* in_c = in + blockIdx.x * channel_size;
@@ -56,6 +57,23 @@ __global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
   }
 }
 
+template <typename T>
+__global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale,
+                                             T max_range, const int num,
+                                             const int cin, const int cout,
+                                             T* out) {
+  int cout_wh_size = num / cin;
+  int wh_size = cout_wh_size / cout;
+
+  T s = scale[blockIdx.x];
+  const T* in_current = in + threadIdx.x * cout_wh_size + blockIdx.x * wh_size;
+  T* out_current = out + threadIdx.x * cout_wh_size + blockIdx.x * wh_size;
+
+  for (int i = 0; i < wh_size; i++) {
+    out_current[i] = in_current[i] * s / max_range;
+  }
+}
+
 template <typename T>
 __global__ void DequantizeTwoScale(const T* in, const T* scale_one,
                                    const T* scale_two, T max_range, int num,
@@ -74,18 +92,29 @@ template <typename T>
 struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& dev_ctx,
                   const framework::Tensor* in, const framework::Tensor** scales,
-                  const int scale_num, T max_range, framework::Tensor* out) {
+                  const int scale_num, T max_range, const int quant_axis,
+                  framework::Tensor* out) {
+    auto in_dims = in->dims();
     const T* in_data = in->data<T>();
     T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
     if (scale_num == 1) {
       int num = in->numel();
-      int channel = in->dims()[0];
       const T* scale_factor = scales[0]->data<T>();
-      int block = 1024;
-      int grid = channel;
-      DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
-          in_data, scale_factor, max_range, num, channel, out_data);
+      if (quant_axis == 0) {
+        int grid = in_dims[0];
+        int block = 1024;
+        DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
+            in_data, scale_factor, max_range, num, in_dims[0], out_data);
+      } else if (quant_axis == 1) {
+        // Dequantize weight of Cin * Cout * W * H
+        int grid = in_dims[1];
+        int block = in_dims[0];
+        DequantizeOneScaleQuantAxis1<T><<<grid, block, 0, dev_ctx.stream()>>>(
+            in_data, scale_factor, max_range, num, in_dims[0], in_dims[1],
+            out_data);
+      }
     } else if (scale_num == 2) {
+      // Not need to consider quant_axis
       int num = in->numel();
       int batch_size = in->dims()[0];
       int channel = in->dims()[1];
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index 500960098f5ce5e66af5690138c15cc0eaa80d83..6ddb12771fd5176dbe27642adcb2ac82e4d7bfbf 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -33,7 +33,7 @@ template <typename DeviceContext, typename T>
 struct ChannelDequantizeFunctor {
   void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
                   const framework::Tensor** scales, const int scale_num,
-                  T max_range, framework::Tensor* out);
+                  T max_range, const int quant_axis, framework::Tensor* out);
 };
 
 template <typename DeviceContext, typename T>
@@ -63,6 +63,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<framework::Tensor>("Out");
 
     auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
+    auto quant_axis = ctx.Attr<int>("quant_axis");
     int max_range = 1;
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
@@ -70,12 +71,12 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     int scale_num = scales.size();
     if (scale_num == 1) {
       PADDLE_ENFORCE_EQ(
-          scales[0]->numel(), in->dims()[0],
+          scales[0]->numel(), in->dims()[quant_axis],
           platform::errors::PreconditionNotMet(
               "The number of first scale values must be the same with "
-              "first dimension value of Input(X) when the `Scales` has only "
-              "one element, but %ld != %ld here.",
-              scales[0]->numel(), in->dims()[0]));
+              "quant_axis dimension value of Input(X) when the `Scales` has "
+              "only one element, but %ld != %ld here.",
+              scales[0]->numel(), in->dims()[quant_axis]));
       max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
     } else if (scale_num == 2) {
       PADDLE_ENFORCE_EQ(
@@ -94,7 +95,8 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
                    (std::pow(2, quant_bits[1] - 1) - 1);
     }
     ChannelDequantizeFunctor<DeviceContext, T>()(
-        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
+        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range),
+        quant_axis, out);
   }
 };
 
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 358f122c8359fa60f2c99492db8851c8a5fc5293..04ac4a35208a54361a4f434e68095e9519ee12e9 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fake_quantize_op.h"
+#include <algorithm>
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/clip_op.h"
@@ -39,13 +40,41 @@ template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
 
 template <typename T>
 struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
-                  const int num, const int channel, T* out) {
-    const int channel_size = num / channel;
-    for (int i = 0; i < channel; i++) {
-      auto* start = in + i * channel_size;
-      auto* end = in + (i + 1) * channel_size;
-      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in_tensor, const int quant_axis,
+                  T* out_abs_max) {
+    // At present, channelwise quantization supports conv2d, depthwise_conv2d
+    // conv2d_transpose and mul
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
+    auto* in_data = in_tensor.data<T>();
+    auto in_dims = in_tensor.dims();
+    const int64_t channel = in_dims[quant_axis];
+    if (quant_axis == 0) {
+      const int64_t channel_size = in_tensor.numel() / channel;
+      for (int64_t i = 0; i < channel; i++) {
+        auto* start = in_data + i * channel_size;
+        auto* end = in_data + (i + 1) * channel_size;
+        out_abs_max[i] =
+            std::abs(*(std::max_element(start, end, Compare<T>())));
+      }
+    } else if (quant_axis == 1) {
+      for (int64_t i = 0; i < channel; i++) {
+        out_abs_max[i] = 0;
+      }
+      const int64_t step_i = in_tensor.numel() / in_dims[0];
+      const int64_t step_j = in_tensor.numel() / (in_dims[0] * in_dims[1]);
+      for (int64_t i = 0; i < in_dims[0]; i++) {
+        for (int64_t j = 0; j < in_dims[1]; j++) {
+          auto* start = in_data + i * step_i + j * step_j;
+          auto* end = in_data + i * step_i + (j + 1) * step_j;
+          T abs_max = std::abs(*(std::max_element(start, end, Compare<T>())));
+          out_abs_max[j] = std::max(out_abs_max[j], abs_max);
+        }
+      }
     }
   }
 };
@@ -92,26 +121,53 @@ template <typename T>
 struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
                   const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, const int channel,
+                  const int bin_cnt, const int quant_axis,
                   framework::Tensor* out) {
+    // At present, channelwise quantization supports conv2d, depthwise_conv2d
+    // conv2d_transpose and mul
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
     auto* scale_data = scale.data<T>();
     auto* in_data = in.data<T>();
     auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    const int channel_size = in.numel() / channel;
+    auto in_dims = in.dims();
+    const int64_t channel = in_dims[quant_axis];
     platform::Transform<platform::CPUDeviceContext> trans;
-    for (int i = 0; i < channel; i++) {
-      T s = scale_data[i];
-      auto* start = in_data + i * channel_size;
-      auto* end = in_data + (i + 1) * channel_size;
-      trans(ctx, start, end, out_data + i * channel_size,
-            ClipFunctor<T>(-s, s));
-    }
-    for (int i = 0; i < channel; i++) {
-      T s = scale_data[i];
-      T inv_s = inverse(s);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-      out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round();
+    if (quant_axis == 0) {
+      const int64_t channel_size = in.numel() / channel;
+      for (int64_t i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        auto* start = in_data + i * channel_size;
+        auto* end = in_data + (i + 1) * channel_size;
+        trans(ctx, start, end, out_data + i * channel_size,
+              ClipFunctor<T>(-s, s));
+      }
+      for (int64_t i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        T inv_s = inverse(s);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round();
+      }
+    } else if (quant_axis == 1) {
+      const int64_t step_i = in.numel() / in_dims[0];
+      const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
+      for (int i = 0; i < in_dims[0]; i++) {
+        for (int j = 0; j < in_dims[1]; j++) {
+          T s = scale_data[j];
+          T inv_s = inverse(s);
+          auto* start = in_data + i * step_i + j * step_j;
+          auto* end = in_data + i * step_i + (j + 1) * step_j;
+          auto* cur_out_data = out_data + i * step_i + j * step_j;
+          trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
+          for (int k = 0; k < step_j; k++) {
+            cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]);
+          }
+        }
+      }
     }
   }
 };
@@ -247,8 +303,9 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
                    "FakeChannelWiseQuantizeAbsMax");
     OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
                    "FakeChannelWiseQuantizeAbsMax");
+    int quant_axis = ctx->Attrs().Get<int>("quant_axis");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -269,6 +326,18 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
               "(Tensor) Output of quantized low level tensor, "
               "but also saved as float data type.");
     AddOutput("OutScale", "(Tensor) Current channel wise scale");
+    AddAttr<int>("quant_axis",
+                 "(int, default 0) The axis for quantization. "
+                 "For conv2d, depthwise_conv2d, conv2d_transpose "
+                 "and mul, the quant_axis is equal to the cout axis.")
+        .SetDefault(0)
+        .AddCustomChecker([](const int& quant_axis) {
+          PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
+                            platform::errors::InvalidArgument(
+                                "'quant_axis' should be 0 or 1, but "
+                                "the received is %d",
+                                quant_axis));
+        });
     AddAttr<int>("bit_length", "(int, default 8)")
         .SetDefault(8)
         .AddCustomChecker([](const int& bit_length) {
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 75a55fa821f0af664ad18cc20c90cd2f3d61d5d0..6ff3c7ec632f236fe4ae6c6504537df3b8a46b7a 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -75,8 +75,8 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
 template <typename T>
-__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
-                                        T* out) {
+__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
+                                                  const int c, T* out) {
   int tid = threadIdx.x;
   int channel_size = n / c;
   const T* in_c = in + blockIdx.x * channel_size;
@@ -100,14 +100,69 @@ __global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
   }
 }
 
+template <typename T>
+__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
+                                                  const int cin, const int cout,
+                                                  T* out) {
+  extern __shared__ T shared_max_data[];
+  int cout_wh_size = n / cin;
+  int wh_size = n / (cin * cout);
+
+  int tid = threadIdx.x;
+  int bid = blockIdx.x;
+  const T* in_current = in + tid * cout_wh_size + bid * wh_size;
+  shared_max_data[tid] = T(0);
+  for (int i = 0; i < wh_size; i++) {
+    T tmp = fabs(in_current[i]);
+    if (tmp > shared_max_data[tid]) {
+      shared_max_data[tid] = tmp;
+    }
+  }
+  __syncthreads();
+
+  int len = blockDim.x;
+  for (int i = (len + 1) / 2; i > 0; len = i, i = (i + 1) / 2) {
+    if (tid < i && tid + i < len &&
+        shared_max_data[tid] < shared_max_data[tid + i]) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    if (i == 1) {
+      i = 0;  // break the loop
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[bid] = shared_max_data[0];
+  }
+}
+
 template <typename T>
 struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
-                  const int num, const int channel, T* out) {
-    int block = 1024;
-    int grid = channel;
-    FindChannelAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
-        in, num, channel, out);
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in_tensor, const int quant_axis,
+                  T* out_abs_max) {
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
+    const int num = in_tensor.numel();
+    auto in_dims = in_tensor.dims();
+    int channel = in_dims[quant_axis];
+    const T* in_data = in_tensor.data<T>();
+    if (quant_axis == 0) {
+      int grid = channel;
+      int block = 1024;
+      FindChannelAbsMaxKernelQuantAxis0<
+          T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
+          in_data, num, channel, out_abs_max);
+    } else if (quant_axis == 1) {
+      int grid = in_dims[1];
+      int block = in_dims[0];
+      FindChannelAbsMaxKernelQuantAxis1<
+          T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
+          in_data, num, in_dims[0], in_dims[1], out_abs_max);
+    }
   }
 };
 
@@ -189,10 +244,12 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
 template struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext,
                                                float>;
 
+// ChannelClipAndQuantKernel for quant_axis is 0
 template <typename T>
-__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
-                                          const int bin_cnt, const int n,
-                                          const int c, T* out) {
+__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
+                                                    const int bin_cnt,
+                                                    const int n, const int c,
+                                                    T* out) {
   int tid = threadIdx.x;
 
   int channel_size = n / c;
@@ -211,22 +268,57 @@ __global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
   }
 }
 
+// ChannelClipAndQuantKernel for quant_axis is 1
+template <typename T>
+__global__ void ChannelClipAndQuantKernelQuantAxis1(const T* in, const T* scale,
+                                                    const int bin_cnt,
+                                                    const int n, const int cin,
+                                                    const int cout, T* out) {
+  T s = scale[blockIdx.x % cout];
+  T inv_s = inverse(s);
+
+  int wh_size = n / (cin * cout);
+  const T* in_c = in + blockIdx.x * wh_size;
+  T* out_c = out + blockIdx.x * wh_size;
+
+  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt * inv_s * v;
+    out_c[i] = round(v);
+  }
+}
+
 template <typename T>
 struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& ctx,
                   const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, const int channel,
+                  const int bin_cnt, const int quant_axis,
                   framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = channel;
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
 
+    int num = in.numel();
+    auto in_dims = in.dims();
     const T* in_data = in.data<T>();
     const T* scale_data = scale.data<T>();
     T* out_data = out->mutable_data<T>(ctx.GetPlace());
 
-    ChannelClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, channel, out_data);
+    if (quant_axis == 0) {
+      int grid = in_dims[0];
+      int block = 1024;
+      ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
+    } else if (quant_axis == 1) {
+      int grid = in_dims[0] * in_dims[1];
+      int block = 1024;
+      ChannelClipAndQuantKernelQuantAxis1<T><<<grid, block, 0, ctx.stream()>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 4136217fb0c5f600971c1c04f803b65de9bbecb4..5c6e0b1f6e26d84462a18da910b412f03b93285d 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -61,15 +61,15 @@ struct FindRangeAbsMaxFunctor {
 
 template <typename DeviceContext, typename T>
 struct FindChannelAbsMaxFunctor {
-  void operator()(const DeviceContext& ctx, const T* in, const int num,
-                  const int channel, T* out);
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in_tensor,
+                  const int quant_axis, T* out_abs_max);
 };
 
 template <typename DeviceContext, typename T>
 struct ChannelClipAndFakeQuantFunctor {
   void operator()(const DeviceContext& ctx, const framework::Tensor& in,
                   const framework::Tensor& scale, const int bin_cnt,
-                  const int channel, framework::Tensor* out);
+                  const int quant_axis, framework::Tensor* out);
 };
 
 template <typename DeviceContext, typename T>
@@ -144,12 +144,13 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
 
     int bit_length = context.Attr<int>("bit_length");
     int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    int quant_axis = context.Attr<int>("quant_axis");
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    FindChannelAbsMaxFunctor<DeviceContext, T>()(
-        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
+                                                 out_scale_data);
     ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
-        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
+        dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
   }
 };
 
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index f81ed3096238b9da84655c86828040062c96106d..847b24f4f0b0b529cdb36b5823a789c97e0a9d84 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -142,13 +142,17 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
                   "Skip calling InferShape() function in the runtime.")
         .SetDefault(true);
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
     AddAttr<float>("Scale_in",
                    "(float, default 1.0f), The quantize scale of input data")
         .SetDefault(1.0f);
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index aff5f49c73bdce9a5899ed8be17419b8301d52c0..d23beea7e4e62ee65f31c4dc903d80310ddfccbc 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -241,6 +241,156 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
   }
 };
 
+class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FlattenContiguousRange");
+    const auto &start_axis = ctx->Attrs().Get<int>("start_axis");
+    const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis");
+    const auto &in_dims = ctx->GetInputDim("X");
+    int in_dims_size = in_dims.size();
+    int real_start_axis = start_axis, real_stop_axis = stop_axis;
+    if (start_axis < 0) {
+      real_start_axis = start_axis + in_dims_size;
+    }
+    if (stop_axis < 0) {
+      real_stop_axis = stop_axis + in_dims_size;
+    }
+    PADDLE_ENFORCE_GE(
+        real_stop_axis, real_start_axis,
+        platform::errors::InvalidArgument("The stop_axis should be greater"
+                                          "than or equal to start_axis."));
+
+    const auto &out_dims =
+        GetOutputShape(real_start_axis, real_stop_axis, in_dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (in_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+
+    OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2");
+    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      xshape_dims[i + 1] = in_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", "XShape");
+  }
+
+  static std::vector<int32_t> GetOutputShape(const int start_axis,
+                                             const int stop_axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1;
+    std::vector<int32_t> out_shape;
+    int in_dims_size = in_dims.size();
+    out_shape.reserve(in_dims_size - stop_axis + start_axis);
+
+    for (int i = 0; i < start_axis; ++i) {
+      out_shape.push_back(in_dims[i]);
+    }
+    for (int i = start_axis; i <= stop_axis; i++) {
+      outer *= in_dims[i];
+    }
+    out_shape.push_back(outer);
+    for (int i = stop_axis + 1; i < in_dims_size; i++) {
+      out_shape.push_back(in_dims[i]);
+    }
+
+    return out_shape;
+  }
+};
+
+class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddOutput("Out",
+              "A 2D tensor is reshaped input tensor. The input dimensions"
+              "up to axis are flattened to the outer dimension of the output"
+              "and the remaining input dimensions are flattened into the inner"
+              "dimension of the output.");
+    AddAttr<int>("start_axis",
+                 "(int)"
+                 "Indicate the input start dimension (exclusive) to flatten")
+        .SetDefault(1);
+    AddAttr<int>("stop_axis",
+                 "(int)"
+                 "Indicate the input stop dimension (exclusive) to flatten")
+        .SetDefault(1);
+    AddComment(R"DOC(
+Flatten Operator
+
+Flattens the input tensor into a new matrix according to start_axis and stop_axis.
+
+Examples:
+Case 1:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    start_axis = 2, stop_axis = -1
+  We get:
+    Out.shape = (3, 100, 400)
+
+Case 2:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    start_axis = 0, stop_axis = -1
+  We get:
+    Out.shape = (3 * 100 * 100 * 4)
+)DOC");
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in FlattenGradOp.")
+        .AsIntermediate();
+  }
+};
+
+template <typename T>
+class FlattenContiguousRangeGradOpMaker
+    : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("flatten_contiguous_range_grad");
+    grad_op->SetInput("XShape", this->Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
+class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *context) const override {
+    OP_INOUT_CHECK(context->HasInput("XShape"), "Input", "XShape",
+                   "FlattenContiguousRangeGrad");
+    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "FlattenContiguousRangeGrad");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
 DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"});
 DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer,
                            {framework::GradVarName("Out"),
@@ -266,6 +416,16 @@ REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
 REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
                   ops::FlattenGradInplaceInferer);
 
+REGISTER_OPERATOR(
+    flatten_contiguous_range, ops::FlattenContiguousRangeOp,
+    ops::FlattenContiguousRangeOpMaker,
+    ops::FlattenContiguousRangeGradOpMaker<paddle::framework::OpDesc>,
+    ops::FlattenContiguousRangeGradOpMaker<paddle::imperative::OpBase>,
+    ops::FlattenOpInplaceInferer);
+REGISTER_OPERATOR(flatten_contiguous_range_grad,
+                  ops::FlattenContiguousRangeGradOp,
+                  ops::FlattenGradInplaceInferer);
+
 REGISTER_OP_CPU_KERNEL(
     flatten, ops::FlattenKernel<paddle::platform::CPUDeviceContext, float>,
     ops::FlattenKernel<paddle::platform::CPUDeviceContext, double>,
@@ -292,3 +452,26 @@ REGISTER_OP_CPU_KERNEL(
     ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
     ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    flatten_contiguous_range,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
+                                      float>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
+                                      double>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
+                                      int8_t>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
+                                      int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    flatten_contiguous_range_grad,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
+                                          float>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
+                                          double>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
+                                          int>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
+                                          int8_t>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
+                                          int64_t>);
diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc
index ac4ad8e2dc1c09f5ee9f0adfb8b19e0e4ec374a4..40fda804eaab9d280fb91c97fb4c4983a28487d0 100644
--- a/paddle/fluid/operators/flatten_op.cu.cc
+++ b/paddle/fluid/operators/flatten_op.cu.cc
@@ -42,3 +42,26 @@ REGISTER_OP_CUDA_KERNEL(
     ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
     ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    flatten_contiguous_range,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
+                                      float>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
+                                      double>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
+                                      int8_t>,
+    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
+                                      int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    flatten_contiguous_range_grad,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
+                                          float>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
+                                          double>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
+                                          int8_t>,
+    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>);
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index 165832c0e68bdef38f0382ea29f7655a18345805..08efaedccd4f40033dfa02a801911f6666e14ec8 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -112,5 +112,73 @@ class Flatten2GradKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &start_axis = context.Attr<int>("start_axis");
+    auto &stop_axis = context.Attr<int>("stop_axis");
+
+    auto *in = context.Input<framework::LoDTensor>("X");
+    auto x_dims = in->dims();
+    int in_dims_size = x_dims.size();
+    int real_start_axis = start_axis, real_stop_axis = stop_axis;
+    if (start_axis < 0) {
+      real_start_axis = start_axis + in_dims_size;
+    }
+    if (stop_axis < 0) {
+      real_stop_axis = stop_axis + in_dims_size;
+    }
+    auto *out = context.Output<framework::LoDTensor>("Out");
+
+    auto out_dims = framework::make_ddim(
+        GetOutputShape(real_start_axis, real_stop_axis, x_dims));
+
+    out->mutable_data(context.GetPlace(), in->type());
+    framework::TensorCopy(
+        *in, context.GetPlace(),
+        context.template device_context<platform::DeviceContext>(), out);
+    out->Resize(out_dims);
+  }
+  static std::vector<int32_t> GetOutputShape(const int start_axis,
+                                             const int stop_axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1;
+    std::vector<int32_t> out_shape;
+    int in_dims_size = in_dims.size();
+    out_shape.reserve(in_dims_size - stop_axis + start_axis);
+
+    for (int i = 0; i < start_axis; ++i) {
+      out_shape.push_back(in_dims[i]);
+    }
+    for (int i = start_axis; i <= stop_axis; i++) {
+      outer *= in_dims[i];
+    }
+    out_shape.push_back(outer);
+    for (int i = stop_axis + 1; i < in_dims_size; i++) {
+      out_shape.push_back(in_dims[i]);
+    }
+
+    return out_shape;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FlattenContiguousRangeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *d_out =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+
+    auto xshape_dims = ctx.Input<framework::LoDTensor>("XShape")->dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+
+    d_x->mutable_data(ctx.GetPlace(), d_out->type());
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    d_x->Resize(x_dims);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 24f656140f42dfc7ca64afd0e02bdab6e2774244..3fc5f3bfc6b1633ffe835606bbac6118e6b32ca6 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -7,7 +7,12 @@ register_operators(EXCLUDES
     fused_fc_elementwise_layernorm_op
     multihead_matmul_op
     fused_embedding_eltwise_layernorm_op
-    fusion_group_op)
+    fusion_group_op
+    fusion_gru_op)
+
+# fusion_gru_op does not have CUDA kernel
+op_library(fusion_gru_op)
+file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")
 
 if (WITH_GPU)
     # fused_bn_activation_op needs cudnn 7.4.1 above
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index e8f371cb4877f343d108e8528345be03cd9b354b..b22f28fbbe3ce8ce178a3d9c17a048817cb750e7 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -216,6 +216,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
       VLOG(3) << "cuDNN forward algo " << algo;
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_output_desc, algo, &workspace_size_in_bytes));
+      if (workspace_size_in_bytes > workspace_size_limit)
+        workspace_size_limit = workspace_size_in_bytes;
     } else {
       std::function<cudnnConvolutionFwdAlgo_t()> search_func =
           [&]() -> cudnnConvolutionFwdAlgo_t {
diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc
index c9e8af6153b672b821355096078e1d186508034c..738e069081511ed2e6df56633971f0db21211ac1 100644
--- a/paddle/fluid/operators/fused/fusion_group_op.cc
+++ b/paddle/fluid/operators/fused/fusion_group_op.cc
@@ -22,8 +22,14 @@ class FusionGroupOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    const size_t num_ins = ctx->Inputs("Inputs").size();
-    const size_t num_outs = ctx->Outputs("Outs").size();
+    OP_INOUT_CHECK(ctx->HasInputs("Inputs"), "Input", "Inputs", "FusionGroup");
+    OP_INOUT_CHECK(ctx->HasOutputs("Outs"), "Output", "Outs", "FusionGroup");
+
+    auto input_names = ctx->Inputs("Inputs");
+    auto output_names = ctx->Outputs("Outs");
+
+    const size_t num_ins = input_names.size();
+    const size_t num_outs = output_names.size();
 
     PADDLE_ENFORCE_GE(
         num_ins, 1UL,
@@ -42,9 +48,12 @@ class FusionGroupOp : public framework::OperatorWithKernel {
     std::vector<framework::DDim> x_dims = ctx->GetInputsDim("Inputs");
     if (type == 0) {
       for (size_t i = 1; i < num_ins; ++i) {
-        PADDLE_ENFORCE_EQ(x_dims[0], x_dims[i],
-                          platform::errors::InvalidArgument(
-                              "All the inputs' dims should be the same."));
+        PADDLE_ENFORCE_EQ(
+            x_dims[0], x_dims[i],
+            platform::errors::InvalidArgument(
+                "All the inputs' dims is expected to be the same. "
+                "But recieved [%s] (name: %s) vs [%s] (name: %s).",
+                x_dims[0], input_names[0], x_dims[i], input_names[i]));
       }
       std::vector<framework::DDim> out_dims;
       for (size_t j = 0; j < num_outs; ++j) {
@@ -76,11 +85,11 @@ class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Outs",
               "(std::vector<LoDTensor>) The outputs of fusion_group op.")
         .AsDuplicable();
-    AddAttr<std::vector<std::string>>(
-        "outs_data_type", "The data type of Outputs in fusion_group op.")
+    AddAttr<std::vector<int>>("outs_dtype",
+                              "The data type of Outputs in fusion_group op.")
         .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "inputs_data_type", "The data type of Inputs in fusion_group op.")
+    AddAttr<std::vector<int>>("inputs_dtype",
+                              "The data type of Inputs in fusion_group op.")
         .SetDefault({});
     AddAttr<int>("type", "Fusion type.").SetDefault(0);
     AddAttr<std::string>("func_name", "Name of the generated functions.")
diff --git a/paddle/fluid/operators/fused/fusion_group_op.h b/paddle/fluid/operators/fused/fusion_group_op.h
index 8449c6b63b1a176071c2197de063b90ec2a535eb..5e5f2c60ffbd48d801aa4cff1b074170c44ed88a 100644
--- a/paddle/fluid/operators/fused/fusion_group_op.h
+++ b/paddle/fluid/operators/fused/fusion_group_op.h
@@ -24,14 +24,14 @@ namespace operators {
 
 static void MutableMultiTypeData(
     std::vector<paddle::framework::LoDTensor*>* var,
-    const std::vector<std::string>& data_type, const platform::Place& place) {
+    const std::vector<int>& data_type, const platform::Place& place) {
   for (size_t i = 0; i < var->size(); i++) {
-    if (data_type[i] == "float") {
+    if (data_type[i] == framework::proto::VarType::FP32) {
       (*var)[i]->mutable_data<float>(place);
-    } else if (data_type[i] == "double") {
-      (*var)[i]->mutable_data<double>(place);
-    } else if (data_type[i] == "::paddle::platform::float16") {
+    } else if (data_type[i] == framework::proto::VarType::FP16) {
       (*var)[i]->mutable_data<paddle::platform::float16>(place);
+    } else if (data_type[i] == framework::proto::VarType::FP64) {
+      (*var)[i]->mutable_data<double>(place);
     }
   }
 }
@@ -43,15 +43,15 @@ class FusionGroupKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<framework::LoDTensor>("Inputs");
     auto outs = ctx.MultiOutput<framework::LoDTensor>("Outs");
     int type = ctx.Attr<int>("type");
-    auto outs_type = ctx.Attr<std::vector<std::string>>("outs_data_type");
-    auto inputs_type = ctx.Attr<std::vector<std::string>>("inputs_data_type");
+    const auto& outs_dtype = ctx.Attr<std::vector<int>>("outs_dtype");
+    const auto& inputs_dtype = ctx.Attr<std::vector<int>>("inputs_dtype");
 
     size_t num_ins = ins.size();
     size_t num_outs = outs.size();
 
     auto place = ctx.GetPlace();
 
-    MutableMultiTypeData(&outs, outs_type, place);
+    MutableMultiTypeData(&outs, outs_dtype, place);
 
     std::string func_name = ctx.Attr<std::string>("func_name");
     platform::DeviceCode* dev_code =
@@ -64,22 +64,22 @@ class FusionGroupKernel : public framework::OpKernel<T> {
       args.push_back(&n);
       std::vector<const void*> ptrs(num_ins + num_outs);
       for (size_t i = 0; i < num_ins; ++i) {
-        if (inputs_type[i] == "::paddle::platform::float16") {
+        if (inputs_dtype[i] == framework::proto::VarType::FP16) {
           ptrs[i] = ins[i]->data<paddle::platform::float16>();
-        } else if (inputs_type[i] == "double") {
-          ptrs[i] = ins[i]->data<double>();
-        } else if (inputs_type[i] == "float") {
+        } else if (inputs_dtype[i] == framework::proto::VarType::FP32) {
           ptrs[i] = ins[i]->data<float>();
+        } else if (inputs_dtype[i] == framework::proto::VarType::FP64) {
+          ptrs[i] = ins[i]->data<double>();
         }
         args.push_back(&ptrs[i]);
       }
       for (size_t j = 0; j < num_outs; ++j) {
-        if (outs_type[j] == "::paddle::platform::float16") {
+        if (outs_dtype[j] == framework::proto::VarType::FP16) {
           ptrs[num_ins + j] = outs[j]->data<paddle::platform::float16>();
-        } else if (outs_type[j] == "double") {
-          ptrs[num_ins + j] = outs[j]->data<double>();
-        } else if (outs_type[j] == "float") {
+        } else if (outs_dtype[j] == framework::proto::VarType::FP32) {
           ptrs[num_ins + j] = outs[j]->data<float>();
+        } else if (outs_dtype[j] == framework::proto::VarType::FP64) {
+          ptrs[num_ins + j] = outs[j]->data<double>();
         }
         args.push_back(&ptrs[num_ins + j]);
       }
diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc
index 48e7d6af397849491c1afeb65c02878b88ccd6cf..d50c829b475752cfad5a41500c6d66d1ecc4c8bf 100644
--- a/paddle/fluid/operators/fused/fusion_group_op_test.cc
+++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc
@@ -57,10 +57,14 @@ framework::OpDesc* CreateFusionGroupOp(
     const std::vector<std::string>& input_names,
     const std::vector<std::vector<int64_t>>& input_shapes,
     const std::vector<std::string>& output_names, int type,
-    const std::vector<std::string>& inputs_data_type,
-    const std::vector<std::string>& outs_data_type, std::string func_name) {
+    std::string func_name) {
   EXPECT_EQ(input_names.size(), input_shapes.size());
 
+  std::vector<int> input_dtypes(input_names.size(),
+                                framework::proto::VarType::FP32);
+  std::vector<int> output_dtypes(output_names.size(),
+                                 framework::proto::VarType::FP32);
+
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto* var = program->MutableBlock(0)->Var(input_names[i]);
     var->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -77,8 +81,8 @@ framework::OpDesc* CreateFusionGroupOp(
   op->SetType("fusion_group");
   op->SetInput("Inputs", input_names);
   op->SetOutput("Outs", output_names);
-  op->SetAttr("inputs_data_type", inputs_data_type);
-  op->SetAttr("outs_data_type", outs_data_type);
+  op->SetAttr("inputs_dtype", input_dtypes);
+  op->SetAttr("outs_dtype", output_dtypes);
   op->SetAttr("type", type);
   op->SetAttr("func_name", func_name);
   op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
@@ -133,8 +137,6 @@ void CheckOutputs(framework::Scope* scope,
 void TestMain(const std::vector<std::string>& input_names,
               const std::vector<std::vector<int64_t>>& input_shapes,
               const std::vector<std::string>& output_names, int type,
-              const std::vector<std::string>& inputs_data_type,
-              const std::vector<std::string>& outs_data_type,
               std::string func_name, std::string cuda_kernel_str,
               CPUKernelFunc cpu_kernel_func) {
   // Compile the device code
@@ -144,9 +146,8 @@ void TestMain(const std::vector<std::string>& input_names,
 
   // Create a ProgramDesc that has a fusion_group_op.
   framework::ProgramDesc program;
-  framework::OpDesc* op_desc =
-      CreateFusionGroupOp(&program, input_names, input_shapes, output_names,
-                          type, inputs_data_type, outs_data_type, func_name);
+  framework::OpDesc* op_desc = CreateFusionGroupOp(
+      &program, input_names, input_shapes, output_names, type, func_name);
   auto fusion_group_op = framework::OpRegistry::CreateOp(*op_desc);
 
   framework::Scope scope;
@@ -216,11 +217,8 @@ void elementwise_cuda_kernel_0(size_t n, float *x, float* y, float* z) {
     }
   };
 
-  std::vector<std::string> inputs_data_type(input_names.size(), "float");
-  std::vector<std::string> outs_data_type(output_names.size(), "float");
-  TestMain(input_names, input_shapes, output_names, 0, inputs_data_type,
-           outs_data_type, "elementwise_cuda_kernel_0", kernel,
-           elementwise_cpu_kernel_0);
+  TestMain(input_names, input_shapes, output_names, 0,
+           "elementwise_cuda_kernel_0", kernel, elementwise_cpu_kernel_0);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index f6c8316e2e9fa071dc58fb8fc43baad9055c5475..d0920098f606e49d4d1a3e4cb6d8a2b6c44ca267 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -19,6 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -122,8 +125,17 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+#ifdef PADDLE_WITH_MKLDNN
+  if (platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
+#endif
   return framework::OpKernelType(
-      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context());
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
+      library);
 }
 
 void FusionGRUOpMaker::Make() {
@@ -187,6 +199,9 @@ void FusionGRUOpMaker::Make() {
                 "bool"
                 "use origin mode in article https://arxiv.org/abs/1412.3555")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddComment(R"DOC(
 The Fusion complete GRU Operator.
 This operator fuse the fully-connected operator into GRU, 
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3940aae53b8ef70c15311305ce13f8929400d405
--- /dev/null
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -0,0 +1,439 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fusion_gru_op.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::LoDTensor;
+using paddle::framework::Tensor;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::MKLDNNGetDataType;
+using paddle::platform::MKLDNNMemDesc;
+using platform::to_void_cast;
+
+template <typename T>
+class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
+ public:
+  GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
+                   const platform::MKLDNNDeviceContext& dev_ctx,
+                   const mkldnn::engine mkldnn_engine,
+                   platform::Place cpu_place, const LoDTensor* input,
+                   const Tensor* weight_h, const Tensor* h0,
+                   const bool is_reverse, const int64_t N, const int64_t Ti,
+                   const int64_t IC, const int64_t OC,
+                   const std::string& unique_name)
+      : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
+            dev_ctx, dev_ctx.GetEngine(), cpu_place,
+            platform::CreateKey(unique_name, Ti)),
+        N(N),
+        Ti(Ti),
+        IC(IC),
+        OC(OC) {
+    // Create memory key without Ti because weights, bias and h0 memories
+    // do not depend on Ti size but primitive and input/output memory do
+    if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
+        platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
+      memory_key_ = unique_name;
+    } else {
+      memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr();
+    }
+
+    if (!this->isCached()) {
+      // oneDNN kernel has hardcoded activation functions
+      PADDLE_ENFORCE_EQ(
+          ctx.Attr<std::string>("gate_activation"), "sigmoid",
+          platform::errors::Unimplemented(
+              "oneDNN fusion_gru supports only sigmoid as a gate activation."));
+      PADDLE_ENFORCE_EQ(
+          ctx.Attr<std::string>("activation"), "tanh",
+          platform::errors::Unimplemented(
+              "oneDNN fusion_gru supports only tanh as an activation."));
+
+      // oneDNN RNN dimensions
+      const int64_t D = 1;  // Directions
+      const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
+      const int64_t G = 3;  // Number of Gates, 3 for GRU
+
+      // Create memory descriptors
+      auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
+                                    MKLDNNMemoryFormat::any);
+      auto weight_x_md = MKLDNNMemDesc(
+          {L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+      auto weight_h_md = MKLDNNMemDesc(
+          {L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+      auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
+                                   MKLDNNMemoryFormat::ldgo);
+      auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(),
+                                     MKLDNNMemoryFormat::any);
+      auto h0_md = dnnl::memory::desc();
+      if (h0) {
+        h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
+                              MKLDNNMemoryFormat::ldnc);
+      }
+
+      // Create GRU oneDNN primitive
+      const auto direction =
+          is_reverse ? dnnl::rnn_direction::unidirectional_right2left
+                     : dnnl::rnn_direction::unidirectional_left2right;
+
+      this->AcquireForwardPrimitiveDescriptor(
+          dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
+          weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
+    }
+  }
+
+  bool is_NTC() {
+    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
+            dnnl::memory::format_tag::ntc);
+  }
+
+  void reorderRNNdata(const T* input_data, T* output_data,
+                      std::vector<size_t> lod, const bool is_reverse,
+                      platform::RNNReorderType reorder_type) {
+    switch (reorder_type) {
+      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
+      case platform::RNNReorderType::PP_NTC: {
+        auto* input_data_iter = input_data;
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
+          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
+          memcpy(output_data + n * Ti * IC + offset, input_data_iter,
+                 sizeof(T) * num_elements);
+          input_data_iter += num_elements;
+        }
+      } break;
+      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
+      case platform::RNNReorderType::PP_TNC: {
+        auto* input_data_iter = input_data;
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]);
+          const auto offset = is_reverse ? (Ti - num_elements) : 0;
+          for (size_t t = 0; t < num_elements; ++t) {
+            memcpy(output_data + (t + offset) * N * IC + n * IC,
+                   input_data_iter, sizeof(T) * IC);
+            input_data_iter += IC;
+          }
+        }
+      } break;
+      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
+      case platform::RNNReorderType::NTC_PP: {
+        auto* output_data_iter = output_data;
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
+          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
+          memcpy(output_data_iter, input_data + n * Ti * OC + offset,
+                 sizeof(T) * num_elements);
+          output_data_iter += num_elements;
+        }
+      } break;
+      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
+      case platform::RNNReorderType::TNC_PP: {
+        auto* output_data_iter = output_data;
+        for (int n = 0; n < N; ++n) {
+          const auto num_elements = lod[n + 1] - lod[n];
+          const auto offset = is_reverse ? (Ti - num_elements) : 0;
+          for (size_t t = 0; t < num_elements; ++t) {
+            memcpy(output_data_iter,
+                   input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC);
+            output_data_iter += OC;
+          }
+        }
+      } break;
+    }
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
+      const LoDTensor* input, const bool is_reverse) {
+    const auto name = this->key_ + "@input_mem";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
+                                                this->engine_);
+      this->dev_ctx_.SetBlob(name, memory_p);
+    }
+
+    const auto& input_lod = input->lod()[0];
+    auto* x_data = input->data<T>();
+
+    auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle());
+    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
+
+    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
+        dnnl::memory::format_tag::ntc) {
+      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
+                     platform::RNNReorderType::PP_NTC);
+    } else {
+      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
+                     platform::RNNReorderType::PP_TNC);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
+    const auto name = this->key_ + "@output_mem";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
+                                                this->engine_);
+      this->dev_ctx_.SetBlob(name, memory_p);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
+    const std::string h0_key = memory_key_ + "@h0";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
+
+    auto* h0_data = to_void_cast(h0->data<T>());
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data);
+      this->dev_ctx_.SetBlob(h0_key, memory_p);
+    } else {
+      memory_p->set_data_handle(h0_data);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
+                                                     const bool origin_mode) {
+    const std::string wx_key = memory_key_ + "@weight_x";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
+
+    if (!memory_p) {
+      auto user_md =
+          MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(),
+                        MKLDNNMemoryFormat::ldigo);
+      auto user_memory = dnnl::memory(user_md, this->engine_);
+
+      auto* weight_x_data =
+          reinterpret_cast<float*>(user_memory.get_data_handle());
+      memcpy(weight_x_data, weight_x->data<float>(),
+             sizeof(float) * IC * 3 * OC);
+
+      if (origin_mode == false) {
+        for (int64_t i = 0; i < IC; ++i) {
+          for (int64_t j = 0; j < OC; ++j) {
+            weight_x_data[j] *= -1;
+          }
+          weight_x_data += 3 * OC;
+        }
+      }
+
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->weights_layer_desc(), this->engine_);
+
+      dnnl::stream astream(this->engine_);
+      dnnl::reorder(user_memory, *memory_p)
+          .execute(astream, user_memory, *memory_p);
+
+      this->dev_ctx_.SetBlob(wx_key, memory_p);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
+                                                     const bool origin_mode) {
+    const std::string wh_key = memory_key_ + "@weight_h";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
+
+    if (!memory_p) {
+      auto user_md =
+          MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(),
+                        MKLDNNMemoryFormat::ldigo);
+      auto user_memory = dnnl::memory(user_md, this->engine_);
+
+      // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
+      // oneDNN format [OC, 3OC]
+      auto* weight_h_data =
+          reinterpret_cast<float*>(user_memory.get_data_handle());
+      auto* user_weight_h_data = weight_h->data<float>();
+
+      auto src1_iter = user_weight_h_data;
+      auto src2_iter = user_weight_h_data + 2 * OC * OC;
+
+      for (int64_t c = 0; c < OC; ++c) {
+        memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float));
+        memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float));
+
+        src1_iter += 2 * OC;
+        src2_iter += OC;
+        weight_h_data += 3 * OC;
+      }
+
+      weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());
+
+      if (origin_mode == false) {
+        for (int64_t i = 0; i < OC; ++i) {
+          for (int64_t j = 0; j < OC; ++j) {
+            weight_h_data[j] *= -1;
+          }
+          weight_h_data += 3 * OC;
+        }
+      }
+
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->weights_iter_desc(), this->engine_);
+
+      dnnl::stream astream(this->engine_);
+      dnnl::reorder(user_memory, *memory_p)
+          .execute(astream, user_memory, *memory_p);
+
+      this->dev_ctx_.SetBlob(wh_key, memory_p);
+    }
+    return memory_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
+                                                  const bool origin_mode) {
+    const std::string bias_key = memory_key_ + "@bias";
+    auto memory_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(bias_key));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
+                                                this->engine_);
+      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
+      if (bias) {
+        const float* user_bias_data =
+            bias->data<float>();  // Bias in oneDNN is always float
+        memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC);
+      } else {
+        // oneDNN always need bias memory, if it's not provided in PP, let
+        // oneDNN allocate memory and set it to 0
+        memset(bias_data, 0, sizeof(float) * 3 * OC);
+      }
+
+      if (origin_mode == false && bias) {
+        for (int64_t i = 0; i < OC; ++i) {
+          bias_data[i] *= -1;
+        }
+      }
+      this->dev_ctx_.SetBlob(bias_key, memory_p);
+    }
+    return memory_p;
+  }
+
+ private:
+  // RNN dimensions
+  // N - Batch Size
+  // Ti - Max sentence length
+  // IC - Input Channels
+  // OC - Output Channels
+  const int64_t N, Ti, IC, OC;
+
+  // Memory size of weights, bias and h0 does not depend
+  // on Ti size, thus we need another key to cache them
+  std::string memory_key_;
+};
+
+template <typename T>
+class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    // Get Tensors
+    const auto* input = ctx.Input<LoDTensor>("X");
+    const auto* h0 = ctx.Input<Tensor>("H0");
+    const auto* weight_x = ctx.Input<Tensor>("WeightX");
+    const auto* weight_h = ctx.Input<Tensor>("WeightH");
+    const auto* bias = ctx.Input<Tensor>("Bias");
+    auto* hidden = ctx.Output<LoDTensor>("Hidden");
+
+    // Get attributes
+    const bool is_reverse = ctx.Attr<bool>("is_reverse");
+    const bool origin_mode = ctx.Attr<bool>("origin_mode");
+
+    // Get tensor dimensions
+    const auto x_dims = framework::vectorize(input->dims());
+    const auto weight_h_dims = framework::vectorize(weight_h->dims());
+    const auto& input_lod = input->lod()[0];
+
+    // Calculate RNN dimensions
+    const int64_t N = input_lod.size() - 1;  // Number of sentences (batches)
+    const int64_t Ti =  // Max length of the sentence in a batch
+        [&input_lod]() {
+          size_t res = 0;
+          for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
+            res = std::max(res, input_lod[i + 1] - input_lod[i]);
+          }
+          return res;
+        }();
+    const int64_t IC = x_dims[1];         // Input channels
+    const int64_t OC = weight_h_dims[0];  // Output channels
+
+    GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
+                                input, weight_h, h0, is_reverse, N, Ti, IC, OC,
+                                ctx.InputName("X") + ctx.InputName("WeightH"));
+
+    auto input_memory_p =
+        handler.AcquireInputMemoryWithReorder(input, is_reverse);
+    auto weight_x_memory_p =
+        handler.AcquireWeightXMemory(weight_x, origin_mode);
+    auto weight_h_memory_p =
+        handler.AcquireWeightHMemory(weight_h, origin_mode);
+    auto bias_memory_p = handler.AcquireBiasMemory(bias, origin_mode);
+    auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
+
+    std::unordered_map<int, dnnl::memory> gru_args = {
+        {DNNL_ARG_SRC_LAYER, *input_memory_p},
+        {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
+        {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
+        {DNNL_ARG_BIAS, *bias_memory_p},
+        {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
+
+    if (h0) {
+      auto h0_memory_p = handler.AcquireH0Memory(h0);
+      gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
+    }
+
+    auto gru_forward_p = handler.AcquireForwardPrimitive();
+
+    dnnl::stream astream(mkldnn_engine);
+    gru_forward_p->execute(astream, gru_args);
+    astream.wait();
+
+    auto* hidden_onednn_data =
+        reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle());
+    auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace());
+    if (handler.is_NTC()) {
+      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
+                             is_reverse, platform::RNNReorderType::NTC_PP);
+    } else {
+      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
+                             is_reverse, platform::RNNReorderType::TNC_PP);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
+                   ops::FusionGRUMKLDNNKernel<float>);
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index f59d46ec79bd0960392ed1b8b3c8ee27b2317e39..c4bdd9e439c54db03f8fa8c4fe439ed6edbd0c7a 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -158,5 +159,133 @@ void GPUGatherNd(const framework::ExecutionContext& context,
       end_size);
 }
 
+template <typename T, typename U>
+__global__ void GatherGPUKernel(const T* input, const U* index, T* out,
+                                int outer_dim_size, int inner_dim_size,
+                                int out_index_dim_size,
+                                int input_index_dim_size, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    int inner_dim_index = idx / (outer_dim_size * out_index_dim_size);
+    int next_idx = idx % (outer_dim_size * out_index_dim_size);
+    int index_dim_index = next_idx / (outer_dim_size);
+    int out_dim_index = next_idx % outer_dim_size;
+    int input_index =
+        inner_dim_index * (outer_dim_size * input_index_dim_size) +
+        index[index_dim_index] * outer_dim_size + out_dim_index;
+    out[idx] = input[input_index];
+  }
+}
+
+template <typename T, typename U>
+__global__ void GatherGradGPUKernel(const T* input, const U* index, T* out,
+                                    int outer_dim_size, int inner_dim_size,
+                                    int input_index_dim_size,
+                                    int out_index_dim_size, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    int inner_dim_index = idx / (outer_dim_size * input_index_dim_size);
+    int next_idx = idx % (outer_dim_size * input_index_dim_size);
+    int index_dim_index = next_idx / (outer_dim_size);
+    int out_dim_index = next_idx % outer_dim_size;
+    int out_index = inner_dim_index * (outer_dim_size * out_index_dim_size) +
+                    index[index_dim_index] * outer_dim_size + out_dim_index;
+    paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx));
+  }
+}
+
+template <typename T, typename U, typename V>
+void GatherV2CUDAFunction(const Tensor* input, const Tensor* index,
+                          const Tensor* axis, Tensor* out,
+                          const paddle::platform::Place& place,
+                          const framework::ExecutionContext& ctx) {
+  int axis_size = axis->numel();
+  int index_size = index->numel();
+  int input_size = input->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+  auto* index_data = index->data<U>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  Tensor cpu_axis;
+  framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis);
+  int axis_index = cpu_axis.data<V>()[0];
+  int index_dim_size = input_dim[axis_index];
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+  std::vector<int> out_dim_vec;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  out_dim_vec.push_back(index_size);
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  auto out_dim = framework::make_ddim(out_dim_vec);
+
+  out->Resize(out_dim);
+  auto* out_data = out->mutable_data<T>(place);
+  int out_size = out->numel();
+
+  int threads = 512;
+  int grid = (out_size + threads - 1) / threads;
+  auto stream = ctx.cuda_device_context().stream();
+  GatherGPUKernel<T, U><<<grid, threads, 0, stream>>>(
+      input_data, index_data, out_data, outer_dim_size, inner_dim_size,
+      index_size, index_dim_size, out_size);
+}
+
+template <typename T, typename U, typename V>
+void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index,
+                              const Tensor* axis, Tensor* out,
+                              const paddle::platform::Place& place,
+                              const framework::ExecutionContext& ctx) {
+  auto* index_data = index->data<U>();
+
+  int axis_size = axis->numel();
+  int index_size = index->numel();
+  int input_size = input->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  Tensor cpu_axis;
+  framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis);
+  int axis_index = cpu_axis.data<V>()[0];
+  int input_index_dim_size = input_dim[axis_index];
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+  }
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+  }
+
+  auto* out_data = out->mutable_data<T>(place);
+  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto out_dim = out->dims();
+  int out_index_dim_size = out_dim[axis_index];
+  operators::math::set_constant(*dev_ctx, out, 0.0);
+
+  int threads = 512;
+  int grid = (input_size + threads - 1) / threads;
+  auto stream = ctx.cuda_device_context().stream();
+  GatherGradGPUKernel<T, U><<<grid, threads, 0, stream>>>(
+      input_data, index_data, out_data, outer_dim_size, inner_dim_size,
+      input_index_dim_size, out_index_dim_size, input_size);
+}
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
index f5a7bffe4745360a307a4b7c61b30c871cf6c756..c12a3b8adc97893f523b307a56c0e6b04ea8d675 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -15,10 +15,12 @@ limitations under the License. */
 #pragma once
 #include <memory.h>
 #include <cstring>
+#include <vector>
 
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -124,5 +126,110 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,
   }
 }
 
+template <typename T, typename U, typename V>
+void GatherV2Function(const Tensor* input, const Tensor* index,
+                      const Tensor* axis, Tensor* out,
+                      const paddle::platform::Place& place) {
+  auto* axis_data = axis->data<V>();
+  auto* index_data = index->data<U>();
+
+  int axis_size = axis->numel();
+  int index_size = index->numel();
+  int input_size = input->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  int axis_index = axis_data[0];
+
+  int input_index_dim_size = input_dim[axis_index];
+  for (int i = 0; i < index_size; i++) {
+    PADDLE_ENFORCE_LT(index_data[i], input_index_dim_size,
+                      platform::errors::InvalidArgument(
+                          "The element of Index must be less than the size of "
+                          "input dim size of axis which is %d, but received "
+                          "index element which is %d in the %d index.",
+                          input_index_dim_size, index_data[i], i));
+  }
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+  std::vector<int> out_dim_vec;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  out_dim_vec.push_back(index_size);
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+    out_dim_vec.push_back(input_dim[i]);
+  }
+  auto out_dim = framework::make_ddim(out_dim_vec);
+
+  out->Resize(out_dim);
+  auto* out_data = out->mutable_data<T>(place);
+
+  int out_index = 0;
+  for (int i = 0; i < inner_dim_size; i++) {
+    for (int j = 0; j < index_size; j++) {
+      for (int k = 0; k < outer_dim_size; k++) {
+        int index = k + index_data[j] * outer_dim_size +
+                    (i * input_size / inner_dim_size);
+        out_data[out_index] = input_data[index];
+        out_index++;
+      }
+    }
+  }
+}
+
+template <typename T, typename U, typename V>
+void GatherV2GradFunction(const Tensor* input, const Tensor* index,
+                          const Tensor* axis, Tensor* out,
+                          const paddle::platform::Place& place) {
+  auto* axis_data = axis->data<V>();
+  auto* index_data = index->data<U>();
+
+  int axis_size = axis->numel();
+  auto input_dim = input->dims();
+  auto* input_data = input->data<T>();
+
+  if (input->numel() == 0) return;
+  PADDLE_ENFORCE_EQ(axis_size, 1,
+                    platform::errors::InvalidArgument(
+                        "Axis size should be 1, but received %d", axis_size));
+  int axis_index = axis_data[0];
+  int input_index_dim_size = input_dim[axis_index];
+
+  int inner_dim_size = 1;
+  int outer_dim_size = 1;
+
+  for (int i = 0; i < axis_index; i++) {
+    inner_dim_size *= input_dim[i];
+  }
+  for (int i = axis_index + 1; i < input_dim.size(); i++) {
+    outer_dim_size *= input_dim[i];
+  }
+
+  auto* out_data = out->mutable_data<T>(place);
+  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto out_dim = out->dims();
+  int out_index_dim_size = out_dim[axis_index];
+  operators::math::set_constant(*dev_ctx, out, 0.0);
+
+  for (int i = 0; i < inner_dim_size; i++) {
+    for (int j = 0; j < input_index_dim_size; j++) {
+      for (int k = 0; k < outer_dim_size; k++) {
+        int index = k + index_data[j] * outer_dim_size +
+                    i * outer_dim_size * out_index_dim_size;
+        out_data[index] += input_data[j * outer_dim_size + k];
+      }
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc
index c22c8a18ca63a05265ac6991cf0e0cbd9e7ea5ed..1427bd04d3442be26be931ca31bf358ebd23efae 100644
--- a/paddle/fluid/operators/gather_nd_op.cc
+++ b/paddle/fluid/operators/gather_nd_op.cc
@@ -45,7 +45,7 @@ class GatherNdOp : public framework::OperatorWithKernel {
         index_dims[index_dims_size - 1], x_dims_size,
         platform::errors::InvalidArgument(
             "Input(Index).shape[-1] should be no greater than Input(X).rank"));
-    PADDLE_ENFORCE_GE(index_dims_size, 2UL,
+    PADDLE_ENFORCE_GE(index_dims_size, 1UL,
                       platform::errors::InvalidArgument(
                           "The rank of Input(Index) should be greater than 1"));
 
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 6a3abaa600281ac4a9762d5c73d398974abbf041..8a3450d1df97a2e99711f9ae029ca2668f38e2b0 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -78,6 +78,9 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
+    AddInput("Axis",
+             "The Tensor which contains the axis that we do gather operation.")
+        .AsDispensable();
     AddOutput("Out", "The output of gather op");
     AddAttr<bool>(
         "overwrite",
@@ -120,6 +123,8 @@ class GatherGradOpMaker : public framework::SingleGradOpMaker<T> {
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("gather_grad");
     op->SetInput("Index", this->Input("Index"));
+    op->SetInput("Axis", this->Input("Axis"));
+
     op->SetInput("X", this->Input("X"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 5bef547c0542b922f646f72ffb7310ef4eb279e9..37fbfb21f60a0568390c6798dc305c91fc8af886 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -31,6 +31,33 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     auto *index = ctx.Input<Tensor>("Index");
     auto *output = ctx.Output<Tensor>("Out");
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2CUDAFunction<T, int32_t, int32_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2CUDAFunction<T, int32_t, int64_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2CUDAFunction<T, int64_t, int32_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2CUDAFunction<T, int64_t, int64_t>(x, index, axis, output, place,
+                                                  ctx);
+      }
+      return;
+    }
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
     const auto &index_type = index->type();
@@ -64,6 +91,34 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradCUDAFunction<T, int32_t, int32_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradCUDAFunction<T, int32_t, int64_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradCUDAFunction<T, int64_t, int32_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradCUDAFunction<T, int64_t, int64_t>(dO, index, axis, dX,
+                                                      place, ctx);
+      }
+      return;
+    }
+
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index e4ce13ca8fc0b49e997749d0f47f15213a3b44f7..8ec0d6ce0b69c791f9bff58f1681f8d4543c57dd 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -35,6 +35,30 @@ class GatherOpKernel : public framework::OpKernel<T> {
     auto *index = ctx.Input<Tensor>("Index");
     auto *output = ctx.Output<Tensor>("Out");
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2Function<T, int32_t, int32_t>(x, index, axis, output, place);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2Function<T, int32_t, int64_t>(x, index, axis, output, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2Function<T, int64_t, int32_t>(x, index, axis, output, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2Function<T, int64_t, int64_t>(x, index, axis, output, place);
+      }
+      return;
+    }
+
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
 
@@ -70,6 +94,30 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    if (ctx.HasInput("Axis")) {
+      const Tensor *axis = ctx.Input<Tensor>("Axis");
+      const auto &index_type = index->type();
+      const auto &axis_type = axis->type();
+      auto place = ctx.GetPlace();
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradFunction<T, int32_t, int32_t>(dO, index, axis, dX, place);
+      }
+      if (index_type == framework::proto::VarType::INT32 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradFunction<T, int32_t, int64_t>(dO, index, axis, dX, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT32) {
+        GatherV2GradFunction<T, int64_t, int32_t>(dO, index, axis, dX, place);
+      }
+      if (index_type == framework::proto::VarType::INT64 &&
+          axis_type == framework::proto::VarType::INT64) {
+        GatherV2GradFunction<T, int64_t, int64_t>(dO, index, axis, dX, place);
+      }
+      return;
+    }
+
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 253078751ce66dd2a6d52dbdd5fe6b5c0ed21849..111d4ad4490074fb53671f6f3180cf17c5abe913 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <random>
+
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -31,22 +33,29 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
     float std = context.Attr<float>("std");
     auto* tensor = context.Output<framework::Tensor>("Out");
 
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
     std::normal_distribution<T> dist(mean, std);
-
     const std::string op_type = "gaussian_random";
     auto shape = GetShape(context, op_type);
     tensor->Resize(shape);
     int64_t size = tensor->numel();
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(gen_engine);
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
index 3bf34fc685ee8af39b66f444c35d606c4b5d8ffb..93f9e108723fbd56e0d3bf5d439614c2c20bb393 100644
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -41,13 +41,14 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
 
     int n = input->dims()[0];
     int c = input->dims()[1];
-    int h = input->dims()[2];
-    int w = input->dims()[3];
-    const int size[4] = {n, c, h, w};
+    int out_h = grid->dims()[1];
+    int out_w = grid->dims()[2];
+    const int size[4] = {n, c, out_h, out_w};
 
     const T* input_data = input->data<T>();
     const T* grid_data = grid->data<T>();
-    T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    T* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
 
     ScopedSpatialTransformerDescriptor st_desc;
     cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
@@ -97,7 +98,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
     const T* grid_data = grid->data<T>();
     const T* output_grad_data = output_grad->data<T>();
     T* input_grad_data =
-        input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
+        input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
     T* grid_grad_data =
         grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 5be490379642e8761a6821fa0dc0d332ca5b41ef..deb71b807128e5c0b173b517e60832894ced41e5 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
@@ -58,21 +59,10 @@ class GridSampleOp : public framework::OperatorWithKernel {
               "Input(X) and Input(Grid) dimension[0] should be equal, but "
               "received X dimension[0](%d) != Grid dimension[0](%d)",
               x_dims[0], grid_dims[0]));
-      PADDLE_ENFORCE_EQ(
-          grid_dims[1], x_dims[2],
-          platform::errors::InvalidArgument(
-              "Input(X) dims[2] and Input(Grid) dims[1] should be equal, but "
-              "received X dimension[2](%d) != Grid dimension[1](%d)",
-              x_dims[2], grid_dims[1]));
-      PADDLE_ENFORCE_EQ(
-          grid_dims[2], x_dims[3],
-          platform::errors::InvalidArgument(
-              "Input(X) dims[3] and Input(Grid) dims[2] should be equal, but "
-              "received X dimension[3](%d) != Grid dimension[2](%d)",
-              x_dims[3], grid_dims[2]));
     }
 
-    ctx->SetOutputDim("Output", x_dims);
+    ctx->SetOutputDim("Output",
+                      {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
     ctx->ShareLoD("X", "Output");
   }
 
@@ -108,15 +98,37 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
         "(bool, default true) Only used in cudnn kernel, need install cudnn")
         .SetDefault(true);
 
+    AddAttr<bool>(
+        "align_corners",
+        "(bool, default true) If align_corners is true, it will project"
+        "-1 and 1 to the centers of the corner pixels. Otherwise, it will "
+        "project"
+        "-1 and 1 to the image edges.")
+        .SetDefault(true);
+
+    AddAttr<std::string>(
+        "mode",
+        "(bool, default true) The interpolation method which can be 'bilinear'"
+        " or 'nearest'.")
+        .SetDefault("bilinear");
+
+    AddAttr<std::string>(
+        "padding_mode",
+        "(bool, default true) The padding method used when source"
+        "index is out of input images. It can be 'zeros', 'reflect' and "
+        "'border'.")
+        .SetDefault("zeros");
+
     AddComment(R"DOC(
-      This operation samples input X by using bilinear interpolation based on 
+      This operation samples input X by using bilinear or nearest interpolation based on 
       flow field grid, which is usually generated by affine_grid. The grid of
       shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
       with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
       (in width dimension) of input data x and grid_y is indexing the 3rd 
       dimension (in height dimension), finally results is the bilinear 
-      interpolation value of 4 nearest corner points.
+      interpolation value or nearest value of 4 nearest corner points.
 
+      For bilinear interpolation mode:
       Step 1:
         Get (x, y) grid coordinates and scale to [0, H-1/W-1].
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..999f990448ca6370dadacbdaee5bf3bcadcaca0e
--- /dev/null
+++ b/paddle/fluid/operators/grid_sampler_op.cu
@@ -0,0 +1,490 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/grid_sampler_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+template <typename T>
+static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
+                                                  int sW, int H, int W,
+                                                  T delta) {
+  if (in_bounds(h, w, H, W)) {
+    platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T _unnormalize(T coord, int size,
+                                                 bool align_corners) {
+  if (align_corners) {
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
+  return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
+}
+
+template <typename T>
+static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
+                                                    int twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<T>(0);
+  }
+  T min = static_cast<T>(twice_low) / 2;
+  T span = static_cast<T>(twice_high - twice_low) / 2;
+  in = fabs(in - min);
+  T extra = fmod(in, span);
+  int flips = static_cast<int>(floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T compute_positions(T coord, int size,
+                                                      PaddingMode padding_mode,
+                                                      bool align_corners) {
+  coord = _unnormalize<T>(coord, size, align_corners);
+  if (padding_mode == PaddingMode::border) {
+    coord = clip_indexes(coord, size - 1);
+  } else if (padding_mode == PaddingMode::reflect) {
+    if (align_corners) {
+      coord = reflect_indexes(coord, 0, 2 * (size - 1));
+    } else {
+      coord = reflect_indexes(coord, -1, 2 * size - 1);
+    }
+    coord = clip_indexes(coord, size - 1);
+  }
+  return coord;
+}
+
+template <typename T>
+static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
+                                                           bool align_corners,
+                                                           T* grad_in) {
+  if (align_corners) {
+    *grad_in = static_cast<T>(size - 1) / 2;
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    *grad_in = static_cast<T>(size) / 2;
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
+                                                           T* grad_in) {
+  if (in <= static_cast<T>(0)) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  } else {
+    T max = static_cast<T>(clip_limit - 1);
+    if (in >= max) {
+      *grad_in = static_cast<T>(0);
+      return max;
+    } else {
+      *grad_in = static_cast<T>(1);
+      return in;
+    }
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
+  if (twice_low == twice_high) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  }
+  int grad_in_mult_;
+  T min = static_cast<T>(twice_low) / 2;
+  T span = static_cast<T>(twice_high - twice_low) / 2;
+  in = in - min;
+  if (in < static_cast<T>(0)) {
+    grad_in_mult_ = -1;
+    in = -in;
+  } else {
+    grad_in_mult_ = 1;
+  }
+  T extra = fmod(in, span);
+  int flips = static_cast<int>(floor(in / span));
+  if (flips % 2 == 0) {
+    *grad_in = static_cast<T>(grad_in_mult_);
+    return extra + min;
+  } else {
+    *grad_in = static_cast<T>(-grad_in_mult_);
+    return span - extra + min;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
+                            bool align_corners, T* grad_in) {
+  T grad_clip, grad_refl;
+  coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
+  if (padding_mode == PaddingMode::border) {
+    coord = clip_indexes_with_mask(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_clip;
+  } else if (padding_mode == PaddingMode::reflect) {
+    if (align_corners) {
+      coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
+    } else {
+      coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
+    }
+    coord = clip_indexes_with_mask(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_refl * grad_clip;
+  }
+
+  return coord;
+}
+
+template <typename T>
+__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
+                                        int out_h, int out_w, int in_h,
+                                        int in_w, const T* input, const T* grid,
+                                        T* output, const Mode mode,
+                                        const PaddingMode padding_mode,
+                                        bool align_corners) {
+  int inp_sN = out_c * in_h * in_w;
+
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+  int out_sN = out_c * out_h * out_w;
+  int out_sC = out_h * out_w;
+  int out_sH = out_w;
+  int out_sW = 1;
+
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    ix = compute_positions(ix, in_w, padding_mode, align_corners);
+    iy = compute_positions(iy, in_h, padding_mode, align_corners);
+
+    if (mode == Mode::bilinear) {
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        *out_ptr_NCHW = static_cast<T>(0);
+        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+        }
+        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
+        }
+        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+        }
+        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
+        }
+      }
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(round(ix));
+      int iy_nearest = static_cast<int>(round(iy));
+
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
+          *out_ptr_NCHW =
+              input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
+        } else {
+          *out_ptr_NCHW = static_cast<T>(0);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
+    auto mode_s = ctx.Attr<std::string>("mode");
+    PaddingMode padding_mode;
+    Mode mode;
+    if (padding_mode_s == "border") {
+      padding_mode = PaddingMode::border;
+    } else if (padding_mode_s == "reflect") {
+      padding_mode = PaddingMode::reflect;
+    } else {
+      padding_mode = PaddingMode::zeros;
+    }
+
+    if (mode_s == "nearest") {
+      mode = Mode::nearest;
+    } else {
+      mode = Mode::bilinear;
+    }
+
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+    VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
+            << "; out_w: " << out_w;
+    auto* output = ctx.Output<Tensor>("Output");
+    auto* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    VLOG(3) << "set constant";
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        dev_ctx, output, static_cast<T>(0));
+    int count = static_cast<int>(n * out_h * out_w);
+
+    auto cu_stream = dev_ctx.stream();
+
+    int block = 512;
+    int grid_size = (count + block - 1) / block;
+    grid_sample_cuda_kernel<T><<<block, grid_size, 0, cu_stream>>>(
+        count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
+        grid->data<T>(), output_data, mode, padding_mode, align_corners);
+  }
+};
+
+template <typename T>
+__global__ void grid_sampler_cuda_backward_kernel(
+    const int nthreads, const T* grad_output, const T* input, const T* grid,
+    int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
+    T* grad_grid, const Mode mode, const PaddingMode padding_mode,
+    bool align_corners) {
+  int inp_sN = out_c * in_h * in_w;
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+
+  int gOut_sN = out_c * out_h * out_w;
+  int gOut_sC = out_h * out_w;
+  int gOut_sH = out_w;
+  int gOut_sW = 1;
+
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    T gix_mult, giy_mult;
+    ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
+                                     &gix_mult);
+    iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
+                                     &giy_mult);
+
+    if (mode == Mode::bilinear) {
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      T gix = static_cast<T>(0), giy = static_cast<T>(0);
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      int inp_offset_NC = n * inp_sN;
+      for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
+               gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        T gOut = grad_output[gOut_offset];
+
+        atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
+                   nw * gOut);
+        atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
+                   ne * gOut);
+        atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
+                   sw * gOut);
+        atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
+                   se * gOut);
+
+        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
+          T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
+          gix -= nw_val * (iy_se - iy) * gOut;
+          giy -= nw_val * (ix_se - ix) * gOut;
+        }
+        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
+          T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
+          gix += ne_val * (iy_sw - iy) * gOut;
+          giy -= ne_val * (ix - ix_sw) * gOut;
+        }
+        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
+          T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
+          gix -= sw_val * (iy - iy_ne) * gOut;
+          giy += sw_val * (ix_ne - ix) * gOut;
+        }
+        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
+          T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
+          gix += se_val * (iy - iy_nw) * gOut;
+          giy += se_val * (ix - ix_nw) * gOut;
+        }
+      }
+
+      T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+      gGrid_ptr_NHW[0] = gix_mult * gix;
+      gGrid_ptr_NHW[1] = giy_mult * giy;
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(::round(ix));
+      int iy_nearest = static_cast<int>(::round(iy));
+
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      for (int c = 0; c < out_c;
+           ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
+                   in_w, grad_output[gOut_offset]);
+      }
+
+      T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+      gGrid_ptr_NHW[0] = static_cast<T>(0);
+      gGrid_ptr_NHW[1] = static_cast<T>(0);
+    }
+  }
+}
+
+template <typename T>
+class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
+    auto mode_s = ctx.Attr<std::string>("mode");
+
+    PaddingMode padding_mode;
+    Mode mode;
+    if (padding_mode_s == "border") {
+      padding_mode = PaddingMode::border;
+    } else if (padding_mode_s == "reflect") {
+      padding_mode = PaddingMode::reflect;
+    } else {
+      padding_mode = PaddingMode::zeros;
+    }
+
+    if (mode_s == "nearest") {
+      mode = Mode::nearest;
+    } else {
+      mode = Mode::bilinear;
+    }
+
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    input_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        ctx.template device_context<paddle::platform::CUDADeviceContext>(),
+        input_grad, static_cast<T>(0));
+    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
+    grid_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
+        ctx.template device_context<paddle::platform::CUDADeviceContext>(),
+        grid_grad, static_cast<T>(0));
+
+    int count = static_cast<int>(n * out_h * out_w);
+    auto cu_stream = dev_ctx.stream();
+    int block = 512;
+    int grid_size = (count + block - 1) / block;
+    grid_sampler_cuda_backward_kernel<T><<<block, grid_size, 0, cu_stream>>>(
+        count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
+        out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad->data<T>(),
+        mode, padding_mode, align_corners);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
+                        ops::GridSampleOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
+                        ops::GridSampleGradOpCUDAKernel<float>,
+                        ops::GridSampleGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index 08a6043eb07a6e44d46428ee195f6cb28c2ee77c..eda800e78faf5da2bb379b8101e4823c5bc2d2f8 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <iostream>
+#include <string>
+#include <utility>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/gather.h"
@@ -22,6 +25,13 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+enum class Mode {
+  bilinear,
+  nearest,
+};
+
+enum class PaddingMode { zeros, border, reflect };
+
 using Tensor = framework::Tensor;
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -39,64 +49,229 @@ static inline bool isInBound(T x, T y, T x_max, T y_max) {
 }
 
 template <typename T>
-static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
-                              const Tensor& grid, Tensor* x_w, Tensor* x_e,
-                              Tensor* y_n, Tensor* y_s, Tensor* d_w,
-                              Tensor* d_e, Tensor* d_n, Tensor* d_s) {
+static inline void unnormalize(const platform::CPUDeviceContext& ctx,
+                               Tensor* grid_slice,
+                               const int max_val,  // height-1 or width-1
+                               bool align_corners) {
   auto& place = *ctx.eigen_device();
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+
+  if (!align_corners) {
+    auto factor = static_cast<T>((max_val + 1) * 0.5);
+    grid_slice_t.device(place) =
+        (grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
+  } else {
+    auto factor = static_cast<T>(max_val * 0.5);
+    grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
+  }
+}
+
+template <typename T>
+static inline void clip(const platform::CPUDeviceContext& ctx,
+                        Tensor* grid_slice,
+                        const int max_val,  // height-1 or width-1
+                        bool align_corners, std::string padding_mode) {
+  auto& place = *ctx.eigen_device();
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+  if (padding_mode == "border") {
+    grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
+                                     .cwiseMin(static_cast<T>(max_val));
+  } else if (padding_mode == "reflect") {
+    if (align_corners) {
+      auto double_range = static_cast<T>(max_val * 2);
+      auto grid_abs = grid_slice_t.abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
+    } else {
+      auto double_range = static_cast<T>((max_val + 1) * 2);
+      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      grid_slice_t.device(place) =
+          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
+      grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
+                                       .cwiseMin(static_cast<T>(max_val));
+    }
+  }
+}
+
+template <typename T>
+static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
+                                const int max_val,  // height-1 or width-1
+                                bool align_corners, std::string padding_mode,
+                                Tensor* grid_slice, Tensor* grid_scale) {
+  auto& place = *ctx.eigen_device();
+  grid_scale->mutable_data<T>(grid_slice->dims(), ctx.GetPlace());
+
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+  auto factor = static_cast<T>(max_val * 0.5);
+  if (!align_corners) {
+    factor = static_cast<T>((max_val + 1) * 0.5);
+  }
+  auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
+
+  if (padding_mode == "border") {
+    //    auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
+    auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
+                   .cwiseMin(static_cast<T>(max_val));
+
+    auto in_bound = (res == grid_slice_t);
+    grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
+    grid_slice_t.device(place) = res;
+  } else if (padding_mode == "reflect") {
+    if (align_corners) {
+      auto double_range = static_cast<T>(max_val * 2);
+      auto is_neg = (grid_slice_t < static_cast<T>(0));
+      auto grid_abs = grid_slice_t.abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      auto one_more_flip = (extra > (double_range - extra));
+      grid_scale_t.device(place) =
+          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
+                          (is_neg != one_more_flip).template cast<T>());
+      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
+    } else {
+      auto double_range = static_cast<T>((max_val + 1) * 2);
+      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
+      auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      auto one_more_flip = (extra > (double_range - extra));
+      auto reflected =
+          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
+      auto clipped = reflected.cwiseMax(static_cast<T>(0))
+                         .cwiseMin(static_cast<T>(max_val));
+      auto in_bound = (clipped == reflected).template cast<T>();
+      grid_scale_t.device(place) =
+          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
+                          (is_neg != one_more_flip).template cast<T>()) *
+          in_bound;
+      grid_slice_t.device(place) = clipped;
+    }
+  }
+}
+
+template <typename T>
+static void calcGridLocations(const platform::CPUDeviceContext& ctx,
+                              const Tensor& grid, const int in_h,
+                              const int in_w, bool align_corners,
+                              std::string padding_mode, Tensor* grid_x,
+                              Tensor* grid_y) {
   const int n = grid.dims()[0];
-  const int h = grid.dims()[1];
-  const int w = grid.dims()[2];
-  const T x_max = static_cast<T>(w - 1);
-  const T y_max = static_cast<T>(h - 1);
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
 
   // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
-  Tensor grid_x, grid_y;
-  T* grid_x_data = grid_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  T* grid_y_data = grid_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
   const T* grid_data = grid.data<T>();
-  for (int i = 0; i < n * h * w; i++) {
+  for (int i = 0; i < n * out_h * out_w; i++) {
     grid_x_data[i] = grid_data[2 * i];
     grid_y_data[i] = grid_data[(2 * i) + 1];
   }
 
-  Tensor ones;
-  ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
-  Tensor half_xmax;
-  Tensor half_ymax;
-  half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto half_xmax_t =
-      EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
-  half_ymax.mutable_data<T>({n, h, w}, ctx.GetPlace());
-  auto half_ymax_t =
-      EigenTensor<T, 3>::From(half_ymax).setConstant(0.5 * y_max);
-
-  // scale grid to [0, h-1/w-1]
-  auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
-  auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-  grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t;
-  grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t;
+  unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
+  unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
+
+  clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
+  clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
+}
+
+template <typename T>
+static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx,
+                                      const Tensor& grid, const int in_h,
+                                      const int in_w, bool align_corners,
+                                      std::string padding_mode, Tensor* grid_x,
+                                      Tensor* grid_y, Tensor* grid_x_scale,
+                                      Tensor* grid_y_scale) {
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+
+  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
+  T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+
+  const T* grid_data = grid.data<T>();
+  for (int i = 0; i < n * out_h * out_w; i++) {
+    grid_x_data[i] = grid_data[2 * i];
+    grid_y_data[i] = grid_data[(2 * i) + 1];
+  }
 
+  unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
+  unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
+
+  clipWithMask<T>(ctx, in_w - 1, align_corners, padding_mode, grid_x,
+                  grid_x_scale);
+  clipWithMask<T>(ctx, in_h - 1, align_corners, padding_mode, grid_y,
+                  grid_y_scale);
+}
+
+template <typename T>
+static void getGridPointValue(const Tensor& input, Tensor* output,
+                              const Tensor& x, const Tensor& y) {
+  const int n = input.dims()[0];
+  const int c = input.dims()[1];
+  const int in_h = input.dims()[2];
+  const int in_w = input.dims()[3];
+  const int out_h = x.dims()[1];
+  const int out_w = x.dims()[2];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
+  auto input_t = EigenTensor<T, 4>::From(input);
+
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
+                      (T)(in_h - 1))) {
+          for (int j = 0; j < c; j++) {
+            output_t(i, j, k, l) =
+                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
+                        static_cast<int>(round(x_t(i, k, l))));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void allNeigbors(const platform::CPUDeviceContext& ctx,
+                        const Tensor& input, Tensor* grid_x, Tensor* grid_y,
+                        Tensor* x_w, Tensor* x_e, Tensor* y_n,
+                        Tensor* y_s,  // positions
+                        Tensor* d_w, Tensor* d_e, Tensor* d_n,
+                        Tensor* d_s,  // distance
+                        Tensor* v_wn, Tensor* v_en, Tensor* v_ws,
+                        Tensor* v_es) {  // values
+  auto& place = *ctx.eigen_device();
+
+  const int c = input.dims()[1];
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
   // calculate coords of 4 corner points
-  x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  y_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  x_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  x_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  y_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  y_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
   auto x_w_t = EigenTensor<T, 3>::From(*x_w);
   auto x_e_t = EigenTensor<T, 3>::From(*x_e);
   auto y_n_t = EigenTensor<T, 3>::From(*y_n);
   auto y_s_t = EigenTensor<T, 3>::From(*y_s);
+
+  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
+  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
+
   x_w_t.device(place) = grid_x_t.floor();
-  x_e_t.device(place) = x_w_t + ones_t;
+  x_e_t.device(place) = x_w_t + static_cast<T>(1);
   y_n_t.device(place) = grid_y_t.floor();
-  y_s_t.device(place) = y_n_t + ones_t;
+  y_s_t.device(place) = y_n_t + static_cast<T>(1);
 
   // calculate distances to 4 sides
-  d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
-  d_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  d_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  d_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  d_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  d_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
   auto d_w_t = EigenTensor<T, 3>::From(*d_w);
   auto d_e_t = EigenTensor<T, 3>::From(*d_e);
   auto d_n_t = EigenTensor<T, 3>::From(*d_n);
@@ -105,28 +280,100 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
   d_e_t.device(place) = x_e_t - grid_x_t;
   d_n_t.device(place) = grid_y_t - y_n_t;
   d_s_t.device(place) = y_s_t - grid_y_t;
+
+  // calc 4 corner points value
+  v_wn->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  v_en->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  v_ws->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  v_es->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
+  getGridPointValue<T>(input, v_wn, *x_w, *y_n);
+  getGridPointValue<T>(input, v_en, *x_e, *y_n);
+  getGridPointValue<T>(input, v_ws, *x_w, *y_s);
+  getGridPointValue<T>(input, v_es, *x_e, *y_s);
 }
 
 template <typename T>
-static void GetGridPointValue(const Tensor& input, Tensor* output,
-                              const Tensor& x, const Tensor& y) {
-  const int n = input.dims()[0];
+static void bilinearInter(const platform::CPUDeviceContext& ctx,
+                          const Tensor& input, Tensor* grid_x, Tensor* grid_y,
+                          Tensor* out) {
+  auto& place = *ctx.eigen_device();
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
   const int c = input.dims()[1];
-  const int h = input.dims()[2];
-  const int w = input.dims()[3];
+
+  Tensor x_w, x_e, y_n, y_s;
+  Tensor d_w, d_e, d_n, d_s;
+  Tensor v_wn, v_en, v_ws, v_es;
+
+  allNeigbors<T>(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e,
+                 &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es);
+
+  auto d_w_t = EigenTensor<T, 3>::From(d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+  auto d_w_scaled_t =
+      d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_e_scaled_t =
+      d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_n_scaled_t =
+      d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_s_scaled_t =
+      d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+  auto v_en_t = EigenTensor<T, 4>::From(v_en);
+  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+  auto v_es_t = EigenTensor<T, 4>::From(v_es);
+  auto output_t = EigenTensor<T, 4>::From(*out);
+  // bilinear interpolaetion by 4 corner points
+  output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
+                           v_en_t * d_w_scaled_t * d_s_scaled_t +
+                           v_ws_t * d_e_scaled_t * d_n_scaled_t +
+                           v_es_t * d_w_scaled_t * d_n_scaled_t;
+}
+
+template <typename T>
+static void nearestInter(const platform::CPUDeviceContext& ctx,
+                         const Tensor& input, Tensor* grid_x, Tensor* grid_y,
+                         Tensor* out) {
+  auto& place = *ctx.eigen_device();
+
+  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
+  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
+  grid_x_t = grid_x_t.round();
+  grid_y_t = grid_y_t.round();
+  getGridPointValue<T>(input, out, *grid_x, *grid_y);
+}
+
+template <typename T>
+static void gatherOutputGradToInputGrad(const Tensor& output_grad,
+                                        Tensor* input_grad, const Tensor& x,
+                                        const Tensor& y, const Tensor& d1,
+                                        const Tensor& d2) {
+  const int n = output_grad.dims()[0];
+  const int c = output_grad.dims()[1];
+  const int out_h = output_grad.dims()[2];
+  const int out_w = output_grad.dims()[3];
+  const int in_h = input_grad->dims()[2];
+  const int in_w = input_grad->dims()[3];
   auto x_t = EigenTensor<T, 3>::From(x);
   auto y_t = EigenTensor<T, 3>::From(y);
-  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
-  auto input_t = EigenTensor<T, 4>::From(input);
+  auto d1_t = EigenTensor<T, 3>::From(d1);
+  auto d2_t = EigenTensor<T, 3>::From(d2);
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
 
   for (int i = 0; i < n; i++) {
-    for (int k = 0; k < h; k++) {
-      for (int l = 0; l < w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
+                      (T)(in_h - 1))) {
           for (int j = 0; j < c; j++) {
-            output_t(i, j, k, l) =
-                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                        static_cast<int>(round(x_t(i, k, l))));
+            input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
+                         static_cast<int>(round(x_t(i, k, l)))) +=
+                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
           }
         }
       }
@@ -135,29 +382,28 @@ static void GetGridPointValue(const Tensor& input, Tensor* output,
 }
 
 template <typename T>
-static void GatherOutputGradToInputGrad(const Tensor& output_grad,
+static void gatherOutputGradToInputGrad(const Tensor& output_grad,
                                         Tensor* input_grad, const Tensor& x,
-                                        const Tensor& y, const Tensor& d1,
-                                        const Tensor& d2) {
+                                        const Tensor& y) {
   const int n = output_grad.dims()[0];
   const int c = output_grad.dims()[1];
-  const int h = output_grad.dims()[2];
-  const int w = output_grad.dims()[3];
+  const int out_h = output_grad.dims()[2];
+  const int out_w = output_grad.dims()[3];
+  const int in_h = input_grad->dims()[2];
+  const int in_w = input_grad->dims()[3];
   auto x_t = EigenTensor<T, 3>::From(x);
   auto y_t = EigenTensor<T, 3>::From(y);
-  auto d1_t = EigenTensor<T, 3>::From(d1);
-  auto d2_t = EigenTensor<T, 3>::From(d2);
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-
   for (int i = 0; i < n; i++) {
-    for (int k = 0; k < h; k++) {
-      for (int l = 0; l < w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
+                      (T)(in_h - 1))) {
           for (int j = 0; j < c; j++) {
             input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
                          static_cast<int>(round(x_t(i, k, l)))) +=
-                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
+                output_grad_t(i, j, k, l);
           }
         }
       }
@@ -165,65 +411,126 @@ static void GatherOutputGradToInputGrad(const Tensor& output_grad,
   }
 }
 
+template <typename T>
+static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx,
+                               const Tensor& input, const Tensor& output_grad,
+                               Tensor* grid_x, Tensor* grid_y,
+                               Tensor* grid_x_scale, Tensor* grid_y_scale,
+                               Tensor* input_grad, Tensor* grid_grad) {
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
+  const int c = input.dims()[1];
+
+  Tensor x_w, x_e, y_n, y_s;
+  Tensor d_w, d_e, d_n, d_s;
+  Tensor v_wn, v_en, v_ws, v_es;
+
+  allNeigbors<T>(ctx, input,
+                 grid_x,  // grid_x
+                 grid_y,  // grid_y
+                 &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en,
+                 &v_ws, &v_es);
+
+  // gather output grad value to input grad by corner point coords and weight
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
+  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
+
+  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+  auto v_en_t = EigenTensor<T, 4>::From(v_en);
+  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+  auto v_es_t = EigenTensor<T, 4>::From(v_es);
+
+  auto d_w_t = EigenTensor<T, 3>::From(d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  Tensor grid_grad_x, grid_grad_y;
+  grid_grad_x.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  grid_grad_y.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
+  auto grid_grad_x_t =
+      EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
+  auto grid_grad_y_t =
+      EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < c; j++) {
+      for (int k = 0; k < out_h; k++) {
+        for (int l = 0; l < out_w; l++) {
+          grid_grad_x_t(i, k, l) +=
+              ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
+               (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
+              output_grad_t(i, j, k, l);
+          grid_grad_y_t(i, k, l) +=
+              ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
+               (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
+              output_grad_t(i, j, k, l);
+        }
+      }
+    }
+  }
+
+  //  const T x_max = static_cast<T>(in_w - 1);
+  //  const T y_max = static_cast<T>(in_h - 1);
+
+  auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
+  auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
+  grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
+  grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
+
+  // gather grid_grad [x, y] in 3rd Dim
+  T* grid_grad_data = grid_grad->data<T>();
+  T* grid_grad_x_data = grid_grad_x.data<T>();
+  T* grid_grad_y_data = grid_grad_y.data<T>();
+  for (int i = 0; i < n * out_h * out_w; i++) {
+    grid_grad_data[2 * i] = grid_grad_x_data[i];
+    grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
+  }
+}
+
 template <typename DeviceContext, typename T>
 class GridSampleOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode = ctx.Attr<std::string>("padding_mode");
+    auto mode = ctx.Attr<std::string>("mode");
+
     auto* input = ctx.Input<Tensor>("X");
     auto* grid = ctx.Input<Tensor>("Grid");
 
-    const int n = input->dims()[0];
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
     const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    // calc locations and distances of 4 corner points
-    Tensor x_w, x_e, y_n, y_s;
-    Tensor d_w, d_e, d_n, d_s;
-    CalcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
-        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
 
     auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
     math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), output,
         static_cast<T>(0));
 
-    // calc 4 corner points value
-    Tensor v_wn, v_en, v_ws, v_es;
-    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
-    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
-    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
-    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
-
-    auto d_w_t = EigenTensor<T, 3>::From(d_w);
-    auto d_e_t = EigenTensor<T, 3>::From(d_e);
-    auto d_n_t = EigenTensor<T, 3>::From(d_n);
-    auto d_s_t = EigenTensor<T, 3>::From(d_s);
-    auto d_w_scaled_t =
-        d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_e_scaled_t =
-        d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_n_scaled_t =
-        d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto d_s_scaled_t =
-        d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
-    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-    auto v_en_t = EigenTensor<T, 4>::From(v_en);
-    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-    auto v_es_t = EigenTensor<T, 4>::From(v_es);
-    auto output_t = EigenTensor<T, 4>::From(*output);
-    // bilinear interpolaetion by 4 corner points
-    output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
-                             v_en_t * d_w_scaled_t * d_s_scaled_t +
-                             v_ws_t * d_e_scaled_t * d_n_scaled_t +
-                             v_es_t * d_w_scaled_t * d_n_scaled_t;
+    Tensor grid_x, grid_y;
+    calcGridLocations<T>(
+        ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
+        in_w, align_corners, padding_mode, &grid_x, &grid_y);
+    if (mode == "bilinear") {
+      bilinearInter<T>(
+          ctx.template device_context<platform::CPUDeviceContext>(), *input,
+          &grid_x, &grid_y, output);
+    } else if (mode == "nearest") {
+      auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+      auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+      grid_x_t = grid_x_t.round();
+      grid_y_t = grid_y_t.round();
+      getGridPointValue<T>(*input, output, grid_x, grid_y);
+    }
   }
 };
 
@@ -231,97 +538,48 @@ template <typename DeviceContext, typename T>
 class GridSampleGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto align_corners = ctx.Attr<bool>("align_corners");
+    auto padding_mode = ctx.Attr<std::string>("padding_mode");
+    auto mode = ctx.Attr<std::string>("mode");
+
     auto* input = ctx.Input<Tensor>("X");
     auto* grid = ctx.Input<Tensor>("Grid");
     auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
 
-    const int n = input->dims()[0];
+    const int n = grid->dims()[0];
+    const int out_h = grid->dims()[1];
+    const int out_w = grid->dims()[2];
     const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
 
     auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
     math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), input_grad,
         static_cast<T>(0));
     auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
-    grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+    grid_grad->mutable_data<T>({n, out_h, out_w, 2}, ctx.GetPlace());
     math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), grid_grad,
         static_cast<T>(0));
-
-    Tensor x_w, x_e, y_n, y_s;
-    Tensor d_w, d_e, d_n, d_s;
-    CalcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
-        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
-
-    // gather output grad value to input grad by corner point coords and weight
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_n, d_e,
-                                   d_s);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_s, d_e,
-                                   d_n);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_n, d_w,
-                                   d_s);
-    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_s, d_w,
-                                   d_n);
-
-    // calc 4 corner points value
-    Tensor v_wn, v_en, v_ws, v_es;
-    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
-    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
-    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
-    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
-    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-    auto v_en_t = EigenTensor<T, 4>::From(v_en);
-    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-    auto v_es_t = EigenTensor<T, 4>::From(v_es);
-
-    auto d_w_t = EigenTensor<T, 3>::From(d_w);
-    auto d_e_t = EigenTensor<T, 3>::From(d_e);
-    auto d_n_t = EigenTensor<T, 3>::From(d_n);
-    auto d_s_t = EigenTensor<T, 3>::From(d_s);
-
-    auto output_grad_t = EigenTensor<T, 4>::From(*output_grad);
-
-    Tensor grid_grad_x, grid_grad_y;
-    grid_grad_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
-    grid_grad_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
-    auto grid_grad_x_t = EigenTensor<T, 3>::From(grid_grad_x).setConstant(0.0);
-    auto grid_grad_y_t = EigenTensor<T, 3>::From(grid_grad_y).setConstant(0.0);
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            grid_grad_x_t(i, k, l) +=
-                ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-            grid_grad_y_t(i, k, l) +=
-                ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-          }
-        }
-      }
-    }
-    const T x_max = static_cast<T>(w - 1);
-    const T y_max = static_cast<T>(h - 1);
-    grid_grad_x_t = grid_grad_x_t * (x_max / (T)2);
-    grid_grad_y_t = grid_grad_y_t * (y_max / (T)2);
-
-    // gather grid_grad [x, y] in 3rd Dim
-    T* grid_grad_data = grid_grad->data<T>();
-    T* grid_grad_x_data = grid_grad_x.data<T>();
-    T* grid_grad_y_data = grid_grad_y.data<T>();
-    for (int i = 0; i < n * h * w; i++) {
-      grid_grad_data[2 * i] = grid_grad_x_data[i];
-      grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
+    Tensor grid_x, grid_y;
+    Tensor grid_x_scale, grid_y_scale;
+    calcGridLocationsWithGrad<T>(
+        ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
+        in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale,
+        &grid_y_scale);
+    if (mode == "bilinear") {
+      gatherBilinearGrad<T>(ctx.template device_context<DeviceContext>(),
+                            *input, *output_grad, &grid_x, &grid_y,
+                            &grid_x_scale, &grid_y_scale, input_grad,
+                            grid_grad);
+    } else {
+      auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+      auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+      grid_x_t = grid_x_t.round();
+      grid_y_t = grid_y_t.round();
+      gatherOutputGradToInputGrad<T>(*output_grad, input_grad, grid_x, grid_y);
     }
   }
 };
diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu
index 09c743c4275169ba8c53ccbd428100b2fc4483d6..4ce6856a7eade1b314d8aef1d039424ad42e07cf 100644
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
@@ -16,7 +16,9 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     huber_loss,
-    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72da43e3bc63c1c585fe19d703892c23ce7b0ec2
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_v2_op.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/isfinite_v2_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace plat = paddle::platform;
+
+namespace paddle {
+namespace operators {
+
+class OverflowV2Op : public framework::OperatorWithKernel {
+ public:
+  OverflowV2Op(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2");
+    UnaryOpUnchangedInferShape(ctx);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    int dtype = -1;
+    auto *x_var = ctx.InputVar("X");
+    if (x_var->IsType<framework::LoDTensor>()) {
+      dtype = x_var->Get<framework::LoDTensor>().type();
+    } else if (x_var->IsType<framework::SelectedRows>()) {
+      dtype = x_var->Get<framework::SelectedRows>().value().type();
+    } else {
+      PADDLE_THROW(plat::errors::InvalidArgument(
+          "Cannot find the input data type by all input data"));
+    }
+    return framework::OpKernelType(framework::proto::VarType::Type(dtype),
+                                   ctx.GetPlace());
+  }
+};
+
+class OverflowV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input tensors of overflowv2 operator.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of overflowv2 operator. "
+              "Same size compare to input tensor");
+    AddComment(string::Sprintf(R"DOC(
+Overflow %s operator.
+
+$$Out = %s(X)$$
+
+Check whether each element of X is Inf or Nan, return the bool result of each
+element of X as a tensor.
+
+%s
+)DOC",
+                               GetName(), GetComments()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetComments() const = 0;
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+#define REGISTER_V2OP_MAKER(op_type, comment)                         \
+  namespace paddle {                                                  \
+  namespace operators {                                               \
+  class _##op_type##OverflowV2OpMaker                                 \
+      : public ::paddle::operators::OverflowV2OpMaker {               \
+   protected:                                                         \
+    std::string GetName() const { return #op_type; }                  \
+    std::string GetComments() const { return comment; }               \
+  };                                                                  \
+  }                                                                   \
+  }                                                                   \
+  REGISTER_OPERATOR(                                                  \
+      op_type, ops::OverflowV2Op, ops::_##op_type##OverflowV2OpMaker, \
+      paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, \
+      paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
+
+#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                       \
+  REGISTER_OP_CPU_KERNEL(                                                    \
+      op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int,  \
+                                   ops::functor>,                            \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, int64_t,       \
+                          ops::functor>,                                     \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,         \
+                          ops::functor>,                                     \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,        \
+                          ops::functor>,                                     \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, plat::float16, \
+                          ops::functor>);
+
+REGISTER_V2OP_MAKER(isinf_v2, "isinfv2(X)");
+REGISTER_V2OP_MAKER(isnan_v2, "isnanv2(X)");
+REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)");
+
+REGISTER_OVERFLOW_CPU_KERNEL(isinf_v2, InfinityV2Functor);
+REGISTER_OVERFLOW_CPU_KERNEL(isnan_v2, NANV2Functor);
+REGISTER_OVERFLOW_CPU_KERNEL(isfinite_v2, IsfiniteV2Functor);
diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4a6d818d0501e60dfffc8995075bb7f0369788fd
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_v2_op.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/isfinite_v2_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor)                       \
+  REGISTER_OP_CUDA_KERNEL(                                                    \
+      op_type, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,  \
+                                   ops::functor>,                             \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, int64_t,       \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,         \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,        \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16, \
+                          ops::functor>);
+
+REGISTER_OVERFLOW_CUDA_KERNEL(isinf_v2, InfinityV2Functor);
+REGISTER_OVERFLOW_CUDA_KERNEL(isnan_v2, NANV2Functor);
+REGISTER_OVERFLOW_CUDA_KERNEL(isfinite_v2, IsfiniteV2Functor);
diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f0aa63ce80248ee9f7839890f611b9d5293789e
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_v2_op.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/isfinite_op.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+struct InfinityV2Functor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorContainsInfV2(tensor, out);
+  }
+};
+
+struct NANV2Functor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorContainsNANV2(tensor, out);
+  }
+};
+
+struct IsfiniteV2Functor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorIsfiniteV2(tensor, out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index 0a7146be83dcb673573f1fdcb94ed2d2c57bd2c3..2c3172d2a1112e2c79a3c1215ccd0d3f08d59451 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -53,11 +53,9 @@ class LinspaceOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library_{framework::LibraryType::kPlain};
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "Start"),
-        ctx.device_context(), layout_, library_);
+        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
+        ctx.GetPlace());
   }
 };
 
@@ -73,6 +71,7 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Num",
              "Number of entry in the sequence. It is a tensor of shape [1], "
              "should be of type int32.");
+    AddAttr<int>("dtype", "The output data type.");
     AddOutput("Out", "A sequence of numbers.");
     AddComment(R"DOC(
     Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
@@ -85,4 +84,6 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
 REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
+                       ops::CPULinspaceKernel<int32_t>,
+                       ops::CPULinspaceKernel<int64_t>,
                        ops::CPULinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
index 47d4536dcfe2a0ab43b3584196a138214e438e3e..8aca892a81d41b1e0a9f7f9c14169c2817ae9452 100644
--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -20,13 +20,15 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
-  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+__global__ void LinspaceKernel(T start, double step, int64_t size, T* out) {
+  CUDA_KERNEL_LOOP(index, size) {
+    out[index] = static_cast<T>(start + step * index);
+  }
 }
 
 template <typename T>
 __global__ void LinspaceSpecialKernel(T start, T* out) {
-  out[0] = start;
+  out[0] = static_cast<T>(start);
 }
 
 template <typename T>
@@ -51,9 +53,9 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
     out->Resize(framework::make_ddim({num}));
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
-    T step = 0;
+    double step = 0;
     if (num != 1) {
-      step = (stop - start) / (num - 1);
+      step = (static_cast<double>(stop - start)) / (num - 1);
     }
 
     auto stream = context.cuda_device_context().stream();
@@ -68,4 +70,6 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
+                        ops::CUDALinspaceKernel<int32_t>,
+                        ops::CUDALinspaceKernel<int64_t>,
                         ops::CUDALinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
index b1fcac73b0ad249aa19859bde770a8554cdb7408..9fb4960375ed7be60598d558c65310bd4a4b84bc 100644
--- a/paddle/fluid/operators/linspace_op.h
+++ b/paddle/fluid/operators/linspace_op.h
@@ -35,14 +35,12 @@ class CPULinspaceKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     if (num > 1) {
-      T step = (stop - start) / (num - 1);
-      T value = start;
+      double step = (static_cast<double>(stop - start)) / (num - 1);
       for (int i = 0; i < num; ++i) {
-        out_data[i] = value;
-        value += step;
+        out_data[i] = static_cast<T>(start + step * i);
       }
     } else {
-      out_data[0] = start;
+      out_data[0] = static_cast<T>(start);
     }
   }
 };
diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6e2b3ecff8c83e47a9016cc3d233d1aa03fb52b
--- /dev/null
+++ b/paddle/fluid/operators/log_softmax_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/log_softmax_op.h"
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+namespace paddle {
+namespace operators {
+
+class LogSoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    return UnaryOpUnchangedInferShapeCheckAxis(ctx);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class LogSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of softmax, "
+             "whose dimension :attr:`axis` is the input_feature_dimensions.");
+    AddOutput("Out", "The normalized values with the same shape as X.");
+    AddAttr<int>("axis",
+                 "The dimension index of Input(x) to perform log_softmax,"
+                 "default -1 for last dimension")
+        .SetDefault(-1);
+    AddComment(R"DOC(
+LogSoftmax Operator.
+
+)DOC");
+  }
+};
+
+class LogSoftmaxOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
+      const override {
+    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Out"}};
+    return m;
+  }
+};
+
+class LogSoftmaxGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "log_softmax_grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@grad", "log_softmax_grad");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        platform::errors::InvalidArgument("Input(Out) and its gradients "
+                                          "should have the same shape."));
+
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("log_softmax_grad");
+    op->SetInput("Out", this->Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker,
+                  ops::LogSoftmaxOpInferVarType,
+                  ops::LogSoftmaxGradOpMaker<paddle::framework::OpDesc>,
+                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    log_softmax,
+    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    log_softmax_grad,
+    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..02fca246d241d476b5540a6af8f49b16d4dae416
--- /dev/null
+++ b/paddle/fluid/operators/log_softmax_op.cu
@@ -0,0 +1,26 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/log_softmax_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, double>,
+    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
+    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b983ac54157d9d0679ac237ca94e742b38833864
--- /dev/null
+++ b/paddle/fluid/operators/log_softmax_op.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+static inline int CanonicalAxis(const int axis, const int rank) {
+  if (axis < 0) {
+    return axis + rank;
+  }
+  return axis;
+}
+
+static inline int SizeToAxis(const int axis, const framework::DDim dims) {
+  int size = 1;
+  for (int i = 0; i < axis; i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+static inline int SizeFromAxis(const int axis, const framework::DDim dims) {
+  int size = 1;
+  for (int i = axis; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+template <typename T>
+struct ValueClip {
+  HOSTDEVICE T operator()(const T& x) const {
+    const T kThreshold = static_cast<T>(-64.);
+    return x < kThreshold ? kThreshold : x;
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct LogSoftmaxFunctor {
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y, const int axis) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+    constexpr int kAxisDim = 1;
+
+    int axis_dim = X->dims()[axis];
+    const int n = SizeToAxis(axis, X->dims());
+    const int d = SizeFromAxis(axis, X->dims());
+    framework::DDim dim_2d{n, d};
+
+    auto logits = EigenMatrix<T>::From(*X, dim_2d);
+    auto log_softmax = EigenMatrix<T>::From(*Y, dim_2d);
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+
+    Eigen::DSizes<int, 1> along_axis(kAxisDim);
+    Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
+    Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+
+    // For numerical stability, logits should be shifted by maximum number along
+    // axis, calculate shifted_logits into log_softmax tensor for memory reuse.
+    if (num_remain == 1) {
+      // axis == -1, axis and class in same dimension, calculate along
+      // class dimension directly for higher performance
+      log_softmax.device(*context.eigen_device()) =
+          (logits -
+           logits.maximum(along_axis)
+               .eval()
+               .reshape(batch_by_one)
+               .broadcast(one_by_class))
+              .unaryExpr(ValueClip<T>());
+    } else {
+      // axis != -1, class dimension split into (axis, remain), max and sum
+      // should be calculated along axis dimension
+      log_softmax.device(*context.eigen_device()) =
+          (logits.reshape(batch_axis_remain) -
+           logits.reshape(batch_axis_remain)
+               .maximum(along_axis)
+               .eval()
+               .reshape(batch_one_remain)
+               .broadcast(one_axis_one)
+               .reshape(batch_classes))
+              .unaryExpr(ValueClip<T>());
+    }
+
+    log_softmax.device(*context.eigen_device()) =
+        log_softmax -
+        log_softmax.exp()
+            .eval()
+            .reshape(batch_axis_remain)
+            .sum(along_axis)
+            .log()
+            .broadcast(one_axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LogSoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* Out = context.Output<framework::Tensor>("Out");
+    const int rank = X->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+
+    // allocate memory on device.
+    Out->mutable_data<T>(context.GetPlace());
+
+    LogSoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), X, Out, axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct LogSoftmaxGradFunctor {
+  void operator()(const DeviceContext& context, const framework::Tensor* Y,
+                  const framework::Tensor* dY, framework::Tensor* dX,
+                  const int axis) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+
+    const int n = SizeToAxis(axis, Y->dims());
+    const int d = SizeFromAxis(axis, Y->dims());
+    framework::DDim dim_2d{n, d};
+
+    auto y = EigenMatrix<T>::From(*Y, dim_2d);
+    auto dy = EigenMatrix<T>::From(*dY, dim_2d);
+    auto dx = EigenMatrix<T>::From(*dX, dim_2d);
+
+    const int axis_dim = Y->dims()[axis];
+    const int batch_size = y.dimension(kBatchDim);
+    const int num_classes = y.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+
+    dx.device(*context.eigen_device()) =
+        dy -
+        (y.exp()) * (dy.reshape(batch_axis_remain)
+                         .sum(along_class)
+                         .broadcast(one_axis));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LogSoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* Out = context.Input<framework::Tensor>("Out");
+    auto* dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const int rank = Out->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+
+    // allocate memory on device.
+    dX->mutable_data<T>(context.GetPlace());
+
+    LogSoftmaxGradFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), Out, dOut, dX, axis);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b44c02757fae9648a7e660a06c03af45d621e02
--- /dev/null
+++ b/paddle/fluid/operators/masked_select_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/masked_select_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class MaskedSelectOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect");
+    OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect");
+    OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect");
+    framework::DDim output_dims(ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Y", output_dims);
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+class MaskedSelectOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    AddInput("Mask",
+             "The mask of Input Tensor to be selected which is a bool Tensor.");
+    AddOutput(
+        "Y",
+        "The returned tensor, the data type "
+        "is same as input, will be on the same device with the input Tensor.");
+    AddComment(R"DOC(
+Size Operator.
+
+Return a new 0-D tensor which indexes the indexed tensor according
+the mask which is a tensor withe data type bool.
+)DOC");
+  }
+};
+
+class MaskedSelectOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Input",
+                   "Input", "MaskedSelect");
+    OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Y")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class MaskedSelectGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("masked_select_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Mask", this->Input("Mask"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer,
+                                    "X");
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker,
+                  ops::MaskedSelectGradOpMaker<paddle::framework::OpDesc>,
+                  ops::MaskedSelectGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad,
+                  ops::MaskedSelectedGradNoNeedBufferVarsInferer);
+
+REGISTER_OP_CPU_KERNEL(
+    masked_select,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MaskedSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    masked_select_grad,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MaskedSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/masked_select_op.cu b/paddle/fluid/operators/masked_select_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7dc0516800c483d1d82a2390a64130e77b1efb01
--- /dev/null
+++ b/paddle/fluid/operators/masked_select_op.cu
@@ -0,0 +1,179 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/scan.h>
+#include "paddle/fluid/operators/masked_select_op.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DDim = framework::DDim;
+
+__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx])
+      mask_array[idx] = 1;
+    else
+      mask_array[idx] = 0;
+  }
+}
+
+template <typename T>
+__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum,
+                                     const bool* mask, const T* input, T* out,
+                                     int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx]) {
+      int index = mask_prefix_sum[idx];
+      out[index] = input[idx];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
+                                         const bool* mask, const T* input,
+                                         T* out, int size) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < size; idx += blockDim.x * gridDim.x) {
+    if (mask[idx]) {
+      int index = mask_prefix_sum[idx];
+      out[idx] = input[index];
+    } else {
+      out[idx] = 0;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MaskedSelectCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto input = ctx.Input<framework::Tensor>("X");
+    auto mask = ctx.Input<framework::Tensor>("Mask");
+    auto out = ctx.Output<framework::Tensor>("Y");
+    auto* mask_data = mask->data<bool>();
+    auto input_data = input->data<T>();
+
+    auto mask_size = mask->numel();
+    auto input_dim = input->dims();
+    auto mask_dim = mask->dims();
+    PADDLE_ENFORCE_EQ(
+        input_dim, mask_dim,
+        platform::errors::InvalidArgument(
+            "The dim size of input and mask in OP(masked_selected) "
+            "must be equal, but got input dim:(%ld), mask dim: "
+            "(%ld). Please check input "
+            "value.",
+            input_dim, mask_dim));
+
+    thrust::device_ptr<const bool> mask_dev_ptr =
+        thrust::device_pointer_cast(mask_data);
+    thrust::device_vector<T> mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size);
+    auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true);
+
+    framework::DDim out_dim{out_size};
+    out->Resize(out_dim);
+    auto out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    Tensor mask_array;
+    Tensor mask_prefix_sum;
+    mask_array.Resize(mask_dim);
+    mask_prefix_sum.Resize(mask_dim);
+
+    int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
+    int32_t* mask_prefix_sum_data =
+        mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
+    int threads = 512;
+    int grid = (mask_size + threads - 1) / threads;
+    auto stream = ctx.cuda_device_context().stream();
+    SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
+                                               mask_size);
+
+    thrust::device_ptr<int32_t> mask_array_dev_ptr =
+        thrust::device_pointer_cast(mask_array_data);
+    thrust::device_vector<int32_t> mask_array_vec(
+        mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
+    thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
+                           mask_array_vec.end(), mask_prefix_sum_data);
+
+    SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
+        mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MaskedSelectGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto input = ctx.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto mask = ctx.Input<framework::Tensor>("Mask");
+    auto out = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* mask_data = mask->data<bool>();
+    auto* input_data = input->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    auto input_size = input->numel();
+    auto mask_size = mask->numel();
+    auto mask_dim = mask->dims();
+
+    auto out_size = mask_size;
+
+    Tensor mask_array;
+    Tensor mask_prefix_sum;
+    mask_array.Resize(mask_dim);
+    mask_prefix_sum.Resize(mask_dim);
+
+    int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
+    int32_t* mask_prefix_sum_data =
+        mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
+    int threads = 512;
+    int grid = (mask_size + threads - 1) / threads;
+    auto stream = ctx.cuda_device_context().stream();
+    SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
+                                               mask_size);
+
+    thrust::device_ptr<int32_t> mask_array_dev_ptr =
+        thrust::device_pointer_cast(mask_array_data);
+    thrust::device_vector<int32_t> mask_array_vec(
+        mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
+    thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
+                           mask_array_vec.end(), mask_prefix_sum_data);
+
+    SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
+        mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    masked_select,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MaskedSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    masked_select_grad,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MaskedSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>);
diff --git a/paddle/fluid/operators/masked_select_op.h b/paddle/fluid/operators/masked_select_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce8371556c82fe105b6719e845d4fd6232f3a95e
--- /dev/null
+++ b/paddle/fluid/operators/masked_select_op.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DDim = framework::DDim;
+
+template <typename DeviceContext, typename T>
+class MaskedSelectKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto input = context.Input<framework::Tensor>("X");
+    auto mask = context.Input<framework::Tensor>("Mask");
+    auto out = context.Output<framework::Tensor>("Y");
+    auto* mask_data = mask->data<bool>();
+    auto input_data = input->data<T>();
+
+    auto mask_size = mask->numel();
+
+    auto input_dim = input->dims();
+    auto mask_dim = mask->dims();
+    PADDLE_ENFORCE_EQ(
+        input_dim, mask_dim,
+        platform::errors::InvalidArgument(
+            "The dim size of input and mask in OP(masked_selected) "
+            "must be equal, but got input dim:(%ld), mask dim: "
+            "(%ld). Please check input "
+            "value.",
+            input_dim, mask_dim));
+
+    int out_size = 0;
+    for (int i = 0; i < mask_size; i++) {
+      if (mask_data[i]) out_size++;
+    }
+
+    framework::DDim out_dim{out_size};
+    out->Resize(out_dim);
+    auto out_data = out->mutable_data<T>(context.GetPlace());
+
+    int index = 0;
+    for (int i = 0; i < mask_size; i++) {
+      if (mask_data[i]) {
+        out_data[index] = input_data[i];
+        index++;
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MaskedSelectGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto out = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto mask = context.Input<framework::Tensor>("Mask");
+    auto input = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+
+    auto* mask_data = mask->data<bool>();
+    auto* input_data = input->data<T>();
+    auto* out_data = out->mutable_data<T>(context.GetPlace());
+    int mask_size = mask->numel();
+
+    int index = 0;
+    for (int i = 0; i < mask_size; i++) {
+      if (mask_data[i]) {
+        out_data[i] = input_data[index];
+        index++;
+      } else {
+        out_data[i] = 0;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index f8c971954fc4c0b367cc6e62df8f7a596b651b94..42a60e9220cf848ba766a19cb7b4d13edc460c11 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -198,6 +198,11 @@ class Blas {
                    int K, T alpha, const T* A, const T* B, T beta, T* C,
                    int batchCount, int64_t strideA, int64_t strideB) const;
 
+  template <typename T>
+  void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
+                   int K, T alpha, const T** A, const T** B, T beta, T** C,
+                   int batchCount) const;
+
 #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
   template <typename T>
   void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index 64b35cfeaecd1f88395db97d0374d919356651eb..d0c5f74d4efb8248b41d8b2af285e8dd7ec4d479 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -458,6 +458,17 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
 #endif  // CUDA_VERSION >= 9010
 }
 
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
+  for (int k = 0; k < batchCount; ++k) {
+    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
+                           C[k]);
+  }
+}
+
 template <>
 template <typename T>
 void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index cdaf53fea30085b34f07c37d50455c9b02dc5c44..892bf15738141bfbb7e75fa6b37c0cda53a8e098 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
+#include <algorithm>
 #include <cmath>
 #include <limits>
 #include <vector>
@@ -655,6 +656,26 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
 #endif
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
+#ifdef PADDLE_WITH_MKLML
+  const int lda = std::max((transA == CblasNoTrans) ? K : M, 1);
+  const int ldb = std::max((transB == CblasNoTrans) ? N : K, 1);
+  const int ldc = std::max(N, 1);
+  CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, A,
+                       &lda, B, &ldb, &beta, C, &ldc, 1 /* group_count */,
+                       &batchCount);
+#else
+  for (int k = 0; k < batchCount; ++k) {
+    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
+                           C[k]);
+  }
+#endif
+}
+
 #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
 template <>
 template <typename T>
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 44b04104419e790b0ca8619b85ec0a1b4d701021..6748d0ab43f70f997b3008f34f4be743b81e8946 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -73,6 +73,13 @@ struct TensorSetConstantCPU {
   float value_;
 };
 
+template <>
+void set_constant_with_place<platform::XPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index b967dd2cfda80358b863dfcd986d4ad7104a9ac0..22164131468a46bffc239509a9213a21f1611ed5 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -111,12 +111,11 @@ __global__ void KernelPool2DGrad(
     int phstart, phend;
     int pwstart, pwend;
     if (adaptive) {
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
+      phstart = AdaptStartIndex(h_offset, output_height, input_height);
+      phend = AdaptEndIndex(h_offset, output_height, input_height);
+
+      pwstart = AdaptStartIndex(w_offset, output_width, input_width);
+      pwend = AdaptEndIndex(w_offset, output_width, input_width);
     } else {
       phstart = (h_offset < ksize_height)
                     ? 0
@@ -159,6 +158,7 @@ __global__ void KernelPool2DGrad(
           pool_size = exclusive ? (hend - hstart) * (wend - wstart)
                                 : ksize_height * ksize_width;
         }
+
         int output_sub_idx = channel_last
                                  ? (ph * output_width + pw) * channels + offsetC
                                  : ph * output_width + pw;
@@ -689,15 +689,14 @@ __global__ void KernelPool3DGrad(
     int phstart, phend;
     int pwstart, pwend;
     if (adaptive) {
-      pdstart = d_offset * output_depth / input_depth;
-      pdend =
-          min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
-      phstart = h_offset * output_height / input_height;
-      phend =
-          min((h_offset + 1) * output_height / input_height + 1, output_height);
-      pwstart = w_offset * output_width / input_width;
-      pwend =
-          min((w_offset + 1) * output_width / input_width + 1, output_width);
+      pdstart = AdaptStartIndex(d_offset, output_depth, input_depth);
+      pdend = AdaptEndIndex(d_offset, output_depth, input_depth);
+
+      phstart = AdaptStartIndex(h_offset, output_height, input_height);
+      phend = AdaptEndIndex(h_offset, output_height, input_height);
+
+      pwstart = AdaptStartIndex(w_offset, output_width, input_width);
+      pwend = AdaptEndIndex(w_offset, output_width, input_width);
     } else {
       pdstart = (d_offset < ksize_depth)
                     ? 0
diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
index 238d9f2905058d267ffbee0669594920d7a9e031..86feaa72d5fa69cd5d76e56182c27b8d048e4c74 100644
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <queue>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/generator.h"
 
 namespace paddle {
 namespace operators {
@@ -31,7 +32,12 @@ UniformSampler::UniformSampler(int64_t range, unsigned int seed)
   dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range);
 }
 
-int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
+int64_t UniformSampler::Sample() const {
+  return framework::Generator::GetInstance()->is_init_py
+             ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+             : (*dist_)(*random_engine_);
+  // return (*dist_)(*random_engine_);
+}
 
 float UniformSampler::Probability(int64_t value) const { return inv_range_; }
 
@@ -46,8 +52,11 @@ int64_t LogUniformSampler::Sample() const {
   // inverse_transform_sampling method
   // More details:
   // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
-  const int64_t value =
-      static_cast<int64_t>(exp((*dist_)(*random_engine_) * log_range_)) - 1;
+  auto cur_random =
+      framework::Generator::GetInstance()->is_init_py
+          ? (*dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+          : (*dist_)(*random_engine_);
+  const int64_t value = static_cast<int64_t>(exp(cur_random * log_range_)) - 1;
   // Mathematically, value should be <= range_, but might not be due to some
   // floating point roundoff, so we mod by range_.
   return value % range_;
@@ -75,8 +84,14 @@ CustomSampler::CustomSampler(int64_t range, const float *probabilities,
 }
 
 int64_t CustomSampler::Sample() const {
-  auto index = (*int_dist_)(*random_engine_);
-  auto p = (*real_dist_)(*random_engine_);
+  auto index =
+      framework::Generator::GetInstance()->is_init_py
+          ? (*int_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+          : (*int_dist_)(*random_engine_);
+  auto p =
+      framework::Generator::GetInstance()->is_init_py
+          ? (*real_dist_)(framework::Generator::GetInstance()->GetCPUEngine())
+          : (*real_dist_)(*random_engine_);
   if (p > alias_probs_[index]) {
     int alias = alias_[index];
 
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 98277df454e09acf82af69d4de5f430ae4f25e57..809164df2056cb4f4856a0b70ea5076351603199 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -535,13 +535,17 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
         R"DOC(When MKLDNN MatMul_transpose_reshape fuse activated, "
               "it's a axis atribute of fused transpose for `Out` output.)DOC")
         .SetDefault({});
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
     AddAttr<float>("Scale_x",
                    "(float, default 1.0f), The quantize scale of X tensor")
         .SetDefault(1.0f);
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0254ad0a563d91282e76cd7bf43343e4d9139842
--- /dev/null
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -0,0 +1,176 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/matmul_v2_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class MatMulV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2");
+    bool trans_x = ctx->Attrs().Get<bool>("trans_x");
+    bool trans_y = ctx->Attrs().Get<bool>("trans_y");
+
+    std::vector<int64_t> dims_x =
+        paddle::framework::vectorize(ctx->GetInputDim("X"));
+    std::vector<int64_t> dims_y =
+        paddle::framework::vectorize(ctx->GetInputDim("Y"));
+    auto ndims_x = dims_x.size();
+    auto ndims_y = dims_y.size();
+
+    bool x_broadcasted = false, y_broadcasted = false;
+    if (ndims_x == 1) {
+      dims_x.insert(dims_x.begin(), 1);
+      ndims_x = 2;
+      x_broadcasted = true;
+    }
+
+    if (ndims_y == 1) {
+      dims_y.push_back(1);
+      ndims_y = 2;
+      y_broadcasted = true;
+    }
+
+    size_t M, N;
+    if (trans_x) {
+      M = dims_x[ndims_x - 1];
+    } else {
+      M = dims_x[ndims_x - 2];
+    }
+    if (trans_y) {
+      N = dims_y[ndims_y - 2];
+    } else {
+      N = dims_y[ndims_y - 1];
+    }
+
+    std::vector<int64_t> new_dims;
+    if (ndims_x >= ndims_y) {
+      new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+    } else {
+      new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+    }
+    if (!x_broadcasted) {
+      new_dims.push_back(M);
+    }
+    if (!y_broadcasted) {
+      new_dims.push_back(N);
+    }
+    if (x_broadcasted && y_broadcasted) {
+      new_dims.push_back(1);
+    }
+
+    auto out_dims = framework::make_ddim(new_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /* --> */ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "tensor of shape (d0, d1 ... M, K)");
+    AddInput("Y", "tensor of shape (d0, d1 ... K, N)");
+    AddOutput("Out", "tensor of shape (d0, d1 ... M, N)");
+    AddAttr<bool>("trans_x",
+                  "Set true to transpose the last two dimensions of X before "
+                  "doing multiplication")
+        .SetDefault(false);
+    AddAttr<bool>("trans_y",
+                  "Set true to transpose the last two dimensions of Y before "
+                  "doing multiplication")
+        .SetDefault(false);
+    AddComment(
+        R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), 
+        B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). 
+        In addition, it also follows the broadcast rule which is similar as
+        numpy.matmul.
+)DOC");
+  }
+};
+
+class MatMulV2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul_v2");
+    OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul_v2");
+    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "matmul_v2");
+    auto x_dims = context->GetInputDim("X");
+    auto y_dims = context->GetInputDim("Y");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (context->HasOutput(x_grad_name)) {
+      context->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (context->HasOutput(y_grad_name)) {
+      context->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+template <typename T>
+class MatMulV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("matmul_v2_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Input("Y"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker,
+                  ops::MatMulV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::MatMulV2GradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    matmul_v2, ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    matmul_v2_grad,
+    ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..64ec65a23419725c7cc481beadb9383402a426bd
--- /dev/null
+++ b/paddle/fluid/operators/matmul_v2_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/matmul_v2_op.h"
+
+namespace ops = paddle::operators;
+namespace plf = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(matmul_v2,
+                        ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
+                        ops::MatMulV2Kernel<plf::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>,
+    ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc83e4d964815ec46452bb0086cf17437b3846a4
--- /dev/null
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -0,0 +1,481 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/dot_op.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
+
+#ifdef __NVCC__
+#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+struct IdentityFunctor {
+  HOSTDEVICE explicit inline IdentityFunctor() {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
+};
+
+template <typename DeviceContext, typename T>
+void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output,
+                            const std::vector<int>& reduce_dims,
+                            const paddle::framework::ExecutionContext& ctx) {
+  if (reduce_dims.empty()) {
+    // FIXME maybe reduce this copy operation
+    framework::TensorCopySync(*input, ctx.GetPlace(), output);
+    return;
+  }
+#ifdef __NVCC__
+  auto stream = ctx.cuda_device_context().stream();
+  TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+      *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+      IdentityFunctor<T>(), stream);
+#else
+  ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
+      input, output, reduce_dims, true, false, ctx)
+      .template apply<T>();
+#endif
+}
+
+static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims,
+                                 const int y_ndim, const std::int64_t* y_dims,
+                                 std::int64_t* x_bd_dims,
+                                 std::int64_t* y_bd_dims,
+                                 std::int64_t* out_bd_dims) {
+  const int ndim = std::max(x_ndim, y_ndim);
+  std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1);
+  std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1);
+  std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim);
+  std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim);
+
+  for (int i = 0; i < ndim; ++i) {
+    PADDLE_ENFORCE_EQ(
+        x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1,
+        true, platform::errors::InvalidArgument(
+                  "Input(X) and Input(Y) has error dim."));
+    if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) {
+      out_bd_dims[i] = 0;
+    } else {
+      out_bd_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]);
+    }
+  }
+}
+
+static int64_t GetIndexMessage(const int n, const int64_t* dims,
+                               const int64_t* index) {
+  int64_t sum = 0;
+  for (int i = 0; i < n; ++i) {
+    if (dims[i] > 1) {
+      sum = sum * dims[i] + index[i];
+    }
+  }
+  return sum;
+}
+
+static void IndexIncreaseFromDims(const int ndim, const int64_t* dims,
+                                  int64_t* index) {
+  for (int i = ndim - 1; i >= 0; --i) {
+    ++index[i];
+    if (index[i] >= dims[i]) {
+      index[i] -= dims[i];
+    } else {
+      break;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+void MatMulFunction(const Tensor* X, const Tensor* Y,
+                    const std::vector<std::int64_t>& x_dims,
+                    const std::vector<std::int64_t>& y_dims, Tensor* Out,
+                    bool trans_x, bool trans_y,
+                    const paddle::framework::ExecutionContext& ctx) {
+  const int x_ndim = x_dims.size();
+  const int y_ndim = y_dims.size();
+
+  // get data ptr
+  const T* x_data = X->data<T>();
+  const T* y_data = Y->data<T>();
+
+  if (x_ndim == 1 && y_ndim == 1) {
+    PADDLE_ENFORCE_EQ(X->numel(), Y->numel(),
+                      platform::errors::InvalidArgument(
+                          "X's numbers is not equal to Y's numbers,"
+                          "when X/Y's dims =1"));
+    VLOG(3) << "MatMul's case 1";
+    Out->Resize({1});
+    Out->mutable_data<T>(ctx.GetPlace());
+    auto out_eigen = framework::EigenScalar<T>::From(*Out);
+    auto x_eigen = framework::EigenVector<T>::Flatten(*X);
+    auto y_eigen = framework::EigenVector<T>::Flatten(*Y);
+
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    out_eigen.device(dev) = (x_eigen * y_eigen).sum();
+    return;
+  }
+
+  auto& dev_ctx = ctx.template device_context<DeviceContext>();
+  auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+  if (x_ndim == 1) {
+    const int N = X->numel();
+    if (trans_y) {
+      PADDLE_ENFORCE_EQ(
+          y_dims[y_ndim - 1], N,
+          platform::errors::InvalidArgument("Input(Y) has error dim."));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          y_dims[y_ndim - 2], N,
+          platform::errors::InvalidArgument("Input(Y) has error dim."));
+    }
+    std::vector<std::int64_t> out_dims(y_ndim - 1);
+    if (trans_y) {
+      std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin());
+    } else {
+      std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
+      out_dims.back() = y_dims.back();
+    }
+    Out->Resize(framework::make_ddim(out_dims));
+    Out->mutable_data<T>(ctx.GetPlace());
+    if (trans_y) {
+      const int M = Y->numel() / N;
+      VLOG(3) << "MatMul's case 2";
+      blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data<T>());
+    } else {
+      const int M = y_dims[y_ndim - 1];
+      const int batch_size = Y->numel() / (M * N);
+      if (batch_size == 1) {
+        VLOG(3) << "MatMul's case 3";
+        blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data<T>());
+      } else {
+        VLOG(3) << "MatMul's case 4";
+        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data,
+                         x_data, 0, Out->data<T>(), batch_size, M * N, 0);
+      }
+    }
+    return;
+  }
+
+  if (y_ndim == 1) {
+    const int N = Y->numel();
+    if (trans_x) {
+      PADDLE_ENFORCE_EQ(
+          x_dims[x_ndim - 2], N,
+          platform::errors::InvalidArgument("Input(X) has error dim."));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          x_dims[x_ndim - 1], N,
+          platform::errors::InvalidArgument("Input(X) has error dim."));
+    }
+    std::vector<std::int64_t> out_dims(x_ndim - 1);
+    if (trans_x) {
+      std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin());
+      out_dims.back() = x_dims.back();
+    } else {
+      std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
+    }
+    Out->Resize(framework::make_ddim(out_dims));
+    Out->mutable_data<T>(ctx.GetPlace());
+
+    if (trans_x) {
+      const int M = x_dims[x_ndim - 1];
+      const int batch_size = X->numel() / (M * N);
+      if (batch_size == 1) {
+        VLOG(3) << "MatMul's case 5";
+        blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data<T>());
+      } else {
+        VLOG(3) << "MatMul's case 6";
+        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data,
+                         y_data, 0, Out->data<T>(), batch_size, M * N, 0);
+      }
+    } else {
+      const int M = X->numel() / N;
+      VLOG(3) << "MatMul's case 7";
+      blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data<T>());
+    }
+    return;
+  }
+
+  const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2];
+  const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
+  if (trans_y) {
+    PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, platform::errors::InvalidArgument(
+                                                 "Input(X) has error dim."));
+  } else {
+    PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, platform::errors::InvalidArgument(
+                                                 "Input(X) has error dim."));
+  }
+  const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1];
+  const int ndim = std::max(x_ndim, y_ndim);
+  std::vector<std::int64_t> x_broadcast_dims(ndim);
+  std::vector<std::int64_t> y_broadcast_dims(ndim);
+  std::vector<std::int64_t> out_broadcast_dims(ndim);
+
+  GetBroadcastFromDims(x_ndim - 2, x_dims.data(), y_ndim - 2, y_dims.data(),
+                       x_broadcast_dims.data(), y_broadcast_dims.data(),
+                       out_broadcast_dims.data());
+
+  out_broadcast_dims[ndim - 2] = M;
+  out_broadcast_dims[ndim - 1] = N;
+
+  Out->Resize(framework::make_ddim(out_broadcast_dims));
+  Out->mutable_data<T>(ctx.GetPlace());
+
+  const int batch_dim = ndim - 2;
+  // broadcast message
+  const bool is_broadcast_dims = !std::equal(
+      x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim,
+      y_broadcast_dims.cbegin());
+
+  const std::int64_t x_batch_size = std::accumulate(
+      x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim, 1LL,
+      std::multiplies<std::int64_t>());
+  const std::int64_t y_batch_size = std::accumulate(
+      y_broadcast_dims.cbegin(), y_broadcast_dims.cbegin() + batch_dim, 1LL,
+      std::multiplies<std::int64_t>());
+  const std::int64_t out_batch_size = std::accumulate(
+      out_broadcast_dims.cbegin(), out_broadcast_dims.cbegin() + batch_dim, 1LL,
+      std::multiplies<std::int64_t>());
+  if (out_batch_size == 0) return;
+  if (x_batch_size == 1 && y_batch_size == 1) {
+    VLOG(3) << "MatMul's case 8";
+    blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
+              trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data,
+              y_data, 0.0f, Out->data<T>());
+  } else if (x_batch_size == 1) {
+    if (M == 1 && trans_y) {
+      VLOG(3) << "MatMul's case 9";
+      blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f,
+                Out->data<T>());
+    } else {
+      VLOG(3) << "MatMul's case 10";
+      blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                       trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f,
+                       x_data, y_data, 0, Out->data<T>(), out_batch_size, 0,
+                       K * N);
+    }
+  } else if (y_batch_size == 1) {
+    if (!trans_x) {
+      VLOG(3) << "MatMul's case 11";
+      blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans,
+                x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f,
+                Out->data<T>());
+    } else {
+      VLOG(3) << "MatMul's case 12";
+      blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
+                       1.0f, x_data, y_data, 0, Out->data<T>(), out_batch_size,
+                       M * K, 0);
+    }
+  } else if (!is_broadcast_dims) {
+    VLOG(3) << "MatMul's case 13";
+    blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                     trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data,
+                     y_data, 0, Out->data<T>(), out_batch_size, M * K, K * N);
+  } else {
+    // in the case, can't use stridedgemm
+    std::vector<const T*> x_ptr(out_batch_size);
+    std::vector<const T*> y_ptr(out_batch_size);
+    std::vector<T*> out_ptr(out_batch_size);
+    std::vector<std::int64_t> index(batch_dim, 0);
+    for (std::int64_t i = 0; i < out_batch_size; ++i) {
+      // using the index to get offset
+      const std::int64_t x_index =
+          GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data());
+      const std::int64_t y_index =
+          GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data());
+
+      x_ptr[i] = x_data + x_index * M * K;
+      y_ptr[i] = y_data + y_index * K * N;
+      out_ptr[i] = Out->data<T>() + i * M * N;
+      IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data());
+    }
+    VLOG(3) << "MatMul's case 14";
+    blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                     trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f,
+                     x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(),
+                     out_batch_size);
+  }
+}
+
+template <typename DeviceContext, typename T>
+void MatMulFunction(const Tensor* X, const Tensor* Y, Tensor* Out, bool trans_x,
+                    bool trans_y,
+                    const paddle::framework::ExecutionContext& ctx) {
+  const std::vector<std::int64_t> x_dims = vectorize(X->dims());
+  const std::vector<std::int64_t> y_dims = vectorize(Y->dims());
+  MatMulFunction<DeviceContext, T>(X, Y, x_dims, y_dims, Out, trans_x, trans_y,
+                                   ctx);
+}
+
+template <typename DeviceContext, typename T>
+class MatMulV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* Out = ctx.Output<Tensor>("Out");
+    bool trans_x = ctx.Attr<bool>("trans_x");
+    bool trans_y = ctx.Attr<bool>("trans_y");
+    MatMulFunction<DeviceContext, T>(X, Y, Out, trans_x, trans_y, ctx);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MatMulV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    bool trans_x = ctx.Attr<bool>("trans_x");
+    bool trans_y = ctx.Attr<bool>("trans_y");
+
+    // get dims
+    std::vector<std::int64_t> x_dims = vectorize(X->dims());
+    std::vector<std::int64_t> y_dims = vectorize(Y->dims());
+    std::vector<std::int64_t> dout_dims = vectorize(dOut->dims());
+
+    int x_ndim = x_dims.size();
+    int y_ndim = y_dims.size();
+    int ndim = dout_dims.size();
+
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    // x's or y's dim = 1
+    if (x_ndim == 1 && y_ndim == 1) {
+      if (dx) dx->mutable_data<T>(ctx.GetPlace());
+      if (dy) dy->mutable_data<T>(ctx.GetPlace());
+      if (dOut->numel() == 1) {
+        DotGradFunction<DeviceContext, T>(X, Y, dOut, dx, dy, ctx);
+        return;
+      }
+    }
+    // It is very tricky. For this broadcast, currently using the reduce sum to
+    // get gradient.
+    if (x_ndim == 1) {
+      x_dims.insert(x_dims.begin() + 0, 1);
+      x_ndim += 1;
+      if (trans_x)
+        dout_dims.push_back(1);
+      else
+        dout_dims.insert(dout_dims.begin() + ndim - 1, 1);
+      ndim += 1;
+    }
+
+    if (y_ndim == 1) {
+      y_dims.push_back(1);
+      y_ndim += 1;
+      if (trans_y)
+        dout_dims.insert(dout_dims.begin() + ndim - 1, 1);
+      else
+        dout_dims.push_back(1);
+      ndim += 1;
+    }
+
+    // the normal case
+    Tensor dx_help, dy_help;
+    if (trans_x) {
+      if (trans_y) {
+        // X'Y': dA = Y'G', dB = G'X'
+        if (dx)
+          MatMulFunction<DeviceContext, T>(Y, dOut, y_dims, dout_dims, &dx_help,
+                                           true, true, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(dOut, X, dout_dims, x_dims, &dy_help,
+                                           true, true, ctx);
+      } else {
+        // X'Y: dX = YG', dY = XG
+        if (dx)
+          MatMulFunction<DeviceContext, T>(Y, dOut, y_dims, dout_dims, &dx_help,
+                                           false, true, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(X, dOut, x_dims, dout_dims, &dy_help,
+                                           false, false, ctx);
+      }
+    } else {
+      if (trans_y) {
+        // XY': dX = GY, dY = G'X
+        if (dx)
+          MatMulFunction<DeviceContext, T>(dOut, Y, dout_dims, y_dims, &dx_help,
+                                           false, false, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(dOut, X, dout_dims, x_dims, &dy_help,
+                                           true, false, ctx);
+      } else {
+        // XY: dX = GY', dY = X'G
+        if (dx)
+          MatMulFunction<DeviceContext, T>(dOut, Y, dout_dims, y_dims, &dx_help,
+                                           false, true, ctx);
+        if (dy)
+          MatMulFunction<DeviceContext, T>(X, dOut, x_dims, dout_dims, &dy_help,
+                                           true, false, ctx);
+      }
+    }
+    // get help dims
+    const std::vector<std::int64_t> dx_help_dims = vectorize(dx_help.dims());
+    const std::vector<std::int64_t> dy_help_dims = vectorize(dy_help.dims());
+
+    std::vector<std::int64_t> dx_broadcast_dims(ndim);
+    std::vector<std::int64_t> dy_broadcast_dims(ndim);
+
+    std::fill(dx_broadcast_dims.data(),
+              dx_broadcast_dims.data() + ndim - x_ndim, 1);
+    std::fill(dy_broadcast_dims.data(),
+              dy_broadcast_dims.data() + ndim - y_ndim, 1);
+    std::copy(x_dims.data(), x_dims.data() + x_ndim,
+              dx_broadcast_dims.data() + ndim - x_ndim);
+    std::copy(y_dims.data(), y_dims.data() + y_ndim,
+              dy_broadcast_dims.data() + ndim - y_ndim);
+
+    std::vector<int> dx_reduce_dims;
+    std::vector<int> dy_reduce_dims;
+    for (int idx = 0; idx <= ndim - 3; idx++) {
+      if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) {
+        dx_reduce_dims.push_back(idx);
+      }
+      if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) {
+        dy_reduce_dims.push_back(idx);
+      }
+    }
+    // reduce sum to get grad by ReduceSum
+    if (dx) {
+      dx->Resize(dx_help.dims());
+      ReduceSumForMatmulGrad<DeviceContext, T>(&dx_help, dx, dx_reduce_dims,
+                                               ctx);
+      dx->Resize(X->dims());
+    }
+    if (dy) {
+      dy->Resize(dy_help.dims());
+      ReduceSumForMatmulGrad<DeviceContext, T>(&dy_help, dy, dy_reduce_dims,
+                                               ctx);
+      dy->Resize(Y->dims());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index aa9606b5f85896cf4905c53b655f894e6429fc9a..5ca9216d0c8d6b3f773a1eb1a0cec216ca6ed4f3 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -76,6 +76,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   // paddle uses beta but mkldnn uses alpha for swish
   if (algorithm == mkldnn::algorithm::eltwise_swish) {
     std::swap(alpha, beta);
+  } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+    alpha = ctx.Attr<T>("threshold");
   }
 
   PADDLE_ENFORCE(
@@ -119,6 +121,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
   // paddle uses beta but mkldnn uses alpha for swish
   if (algorithm == mkldnn::algorithm::eltwise_swish) {
     std::swap(alpha, beta);
+  } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+    alpha = ctx.Attr<T>("threshold");
   }
 
   auto diff_dst_tz = framework::vectorize<int64_t>(diff_y->dims());
@@ -192,6 +196,10 @@ template <typename T>
 using ReluMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
 
+template <typename T>
+using Relu6MKLDNNFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_bounded_relu>;
+
 template <typename T>
 using SwishMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_swish>;
@@ -216,6 +224,10 @@ template <typename T>
 using ReluMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
 
+template <typename T>
+using Relu6MKLDNNGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_bounded_relu>;
+
 template <typename T>
 using SwishMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_swish>;
@@ -249,6 +261,7 @@ namespace ops = paddle::operators;
 
 #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)                     \
   __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);          \
+  __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor);       \
   __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);    \
   __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor);          \
   __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor);       \
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 17e1e1958346155af32cf75b5e9fc25cdbdd91eb..7d99bb7d2b7a7049c67788df4c507afc14880815 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -72,7 +72,7 @@ static mkldnn::memory::data_type GetDstType(bool is_int8,
   return dst_dt;
 }
 
-template <typename T>
+template <typename T, typename K, typename T_out>
 class ConvMKLDNNHandlerT
     : public platform::MKLDNNHandlerT<T, mkldnn::convolution_forward> {
  public:
@@ -227,7 +227,7 @@ class ConvMKLDNNHandlerT
           platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(),
                                   MKLDNNMemoryFormat::any);
       const auto dst_md = platform::MKLDNNMemDesc(
-          dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+          dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
 
       const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
                                          : mkldnn::prop_kind::forward_training;
@@ -313,29 +313,29 @@ class ConvMKLDNNHandlerT
     if (is_test && weights_mem_p) {
       return weights_mem_p;
     } else {
-      const T* filter_data = filter->data<T>();
+      const K* filter_data = filter->data<K>();
       auto weights_tz = framework::vectorize(filter->dims());
       GetWeightsTz(weights_tz, groups);
 
       auto user_src_md = platform::MKLDNNMemDesc(
-          weights_tz, platform::MKLDNNGetDataType<T>(),
+          weights_tz, platform::MKLDNNGetDataType<K>(),
           GetWeightsFormat(filter->format(), groups, is_conv3d));
 
       return this->AcquireMemoryWithReorder(
           user_src_md, this->fwd_pd_->weights_desc(),
-          to_void_cast<T>(filter_data), "@weights_mem_p", is_test);
+          to_void_cast<K>(filter_data), "@weights_mem_p", is_test);
     }
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
       const framework::Tensor* bias, const bool is_test) {
-    const T* bias_data = bias->data<T>();
+    const K* bias_data = bias->data<K>();
     auto user_bias_md = platform::MKLDNNMemDesc(
-        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<T>(),
+        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
         MKLDNNMemoryFormat::x);
 
     return this->AcquireMemoryWithReorder(
-        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<T>(bias_data),
+        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<K>(bias_data),
         "@bias_mem_p", is_test);
   }
 
@@ -358,14 +358,14 @@ class ConvMKLDNNHandlerT
     if (residual_param->format() !=
         platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
       auto residual_memory_p = this->AcquireResidualMemory(residual_param);
-      dst_memory_p = this->AcquireDstMemory(output);
+      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
       this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
     } else {
       // Changing ShareDataWith to TensorCopy results in performance drop
       // on ResNet architectures
       // (https://github.com/PaddlePaddle/Paddle/issues/22964)
       output->ShareDataWith(*residual_param);
-      dst_memory_p = this->AcquireDstMemory(output);
+      dst_memory_p = this->template AcquireDstMemory<T_out>(output);
     }
     return dst_memory_p;
   }
@@ -381,7 +381,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     bool is_INT8 =
         std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
     if (!is_INT8) {
-      ComputeFP32(ctx);
+      ComputeFP32<float>(ctx);
     } else {
       std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
       bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
@@ -399,6 +399,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
   }
 
+  template <typename T_out>
   void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
@@ -414,7 +415,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    ConvMKLDNNHandlerT<T> handler(
+    ConvMKLDNNHandlerT<T, K, T_out> handler(
         ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
         output, ctx.InputName("Input") + ctx.InputName("Filter"));
 
@@ -429,7 +430,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       dst_memory_p =
           handler.AcquireDstMemoryWithResidual(output, residual_param);
     } else {
-      dst_memory_p = handler.AcquireDstMemory(output);
+      dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
     }
 
     auto conv_p = handler.AcquireForwardPrimitive();
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 86c1c3232644a1fed236563a65a16bc2f6466d49..540642c7140e707441ad9c4d71ae9b777863a7bd 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -51,11 +51,11 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
     mkldnn::memory::data_type src_dt =
         paddle::framework::ToMKLDNNDataType(input->type());
     MKLDNNMemoryFormat src_fmt = input->format();
-    std::string key =
-        platform::CreateKey(src_dt, src_tz, ctx.OutputName("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
+    std::string key = platform::CreateKey(platform::ThreadIDasStr(), src_dt,
+                                          src_tz, ctx.OutputName("Output"));
+    const std::string key_prim = key + "@r";
+    const std::string key_src_mem = key + "@s";
+    const std::string key_dst_mem = key + "@d";
 
     std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index 37b6e3bb803a2b68cec54059b266bd7585ff9958..d0ecca78ae8b27451bc51a3c1561609fc470a9f8 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 #include "paddle/fluid/operators/mean_op.h"
 
@@ -28,21 +29,29 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
     float std = context.Attr<float>("std");
     auto* tensor = context.Output<framework::Tensor>("Out");
 
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
-    std::normal_distribution<T> dist(mean, std);
-
     const std::string op_type = "gaussian_random";
     auto shape = GetShape(context, op_type);
     tensor->Resize(shape);
     T* data = tensor->mutable_data<T>(context.GetPlace());
     int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
+    std::normal_distribution<T> dist(mean, std);
+
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(gen_engine);
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
     }
 
     tensor->set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 55bd683f8f4283287e1bd67810170bd4082379a6..29a86a35d7b26f41745907fb6bacf30506c027a0 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -48,11 +48,12 @@ class QuantOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
 
     bool is_negative = ctx.Attr<bool>("is_negative_input");
-    std::string key = platform::CreateKey(src_tz, scale_data, is_negative,
-                                          ctx.OutputName("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
+    std::string key =
+        platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data,
+                            is_negative, ctx.OutputName("Output"));
+    const std::string key_prim = key + "@r";
+    const std::string key_src_mem = key + "@s";
+    const std::string key_dst_mem = key + "@d";
 
     std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
index 92e7744e3c0a459f3267f4210d42752b5ec0bcc0..5ad5ad9450503111882a9b3bc2cd9161f74d500e 100644
--- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
@@ -40,11 +40,12 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
 
     auto src_tz = paddle::framework::vectorize(input->dims());
 
-    std::string key = platform::CreateKey(src_tz, scale_in, scale_out,
-                                          ctx.OutputName("Output"));
-    const std::string key_prim = key + "@reorder_p";
-    const std::string key_src_mem = key + "@src_mem";
-    const std::string key_dst_mem = key + "@dst_mem";
+    std::string key =
+        platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_in,
+                            scale_out, ctx.OutputName("Output"));
+    const std::string key_prim = key + "@r";
+    const std::string key_src_mem = key + "@s";
+    const std::string key_dst_mem = key + "@d";
 
     std::shared_ptr<dnnl::memory> src_memory;
     std::shared_ptr<dnnl::memory> dst_memory;
diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc
index e99ccd31714787306358d9b19b31a62ff21d5dab..f0b5f4a466a0049c53d51d8610cf115d8bfe0295 100644
--- a/paddle/fluid/operators/nll_loss_op.cc
+++ b/paddle/fluid/operators/nll_loss_op.cc
@@ -55,8 +55,8 @@ class NLLLossOp : public framework::OperatorWithKernel {
                               "Input(Weight) should be a 1D tensor."));
         PADDLE_ENFORCE_EQ(x_dims[1], w_dims[0],
                           platform::errors::InvalidArgument(
-                              "Input(Weight) Tensor's size should match"
-                              "to the class numer."));
+                              "Input(Weight) Tensor's size should match "
+                              "to the the total number of classes."));
       }
     }
     if (x_dims.size() == 2) {
diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu
index 3d618805f02aa9b6d5310bfc8a79857f522f8ac5..531c175e03e5eee3eba609c322944b1398253726 100644
--- a/paddle/fluid/operators/nll_loss_op.cu
+++ b/paddle/fluid/operators/nll_loss_op.cu
@@ -44,6 +44,8 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
       out_data[i] = 0;
       continue;
     }
+    PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                   "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
     out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight;
   }
@@ -62,6 +64,8 @@ __global__ void GPUNLLLossForward1D_with_reduce(
   for (i = threadIdx.x; i < batch_size; i += NTHREADS) {
     const auto cur_label = label_data[i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                     "label should not be out of bounds.");
       const auto cur_weight = weight_data ? weight_data[cur_label] : (T)1;
       sharedInputs[threadIdx.x] -=
           x_data[i * n_classes + cur_label] * cur_weight;
@@ -198,6 +202,8 @@ __global__ void GPUNLLLossForward2D_no_reduce(
       out_data[index] = 0;
       continue;
     }
+    PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                   "label should not be out of bounds.");
     const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
     out_data[index] =
         -x_data[b * sample_size + cur_label * map_size + h * in_dim3 + w] *
@@ -226,6 +232,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(
        i < map_nelem; i += step) {
     const int64_t cur_label = label_data[toffset + i];
     if (cur_label != ignore_index) {
+      PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes,
+                     "label should not be out of bounds.");
       const T cur_weight = weight_data ? weight_data[cur_label] : (T)1;
       input_sum -= x_data[ioffset + i + map_nelem * cur_label] * cur_weight;
       acc_weight += cur_weight;
diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h
index 92f3d169f3f6a3be1009d84ebd87c82691eb9f0c..e93d5792205900635093e5f18d715e4607f73cda 100644
--- a/paddle/fluid/operators/nll_loss_op.h
+++ b/paddle/fluid/operators/nll_loss_op.h
@@ -91,7 +91,7 @@ static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data,
           }
           PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
                             platform::errors::InvalidArgument(
-                                "label should nor be out of bounds."));
+                                "label should not be out of bounds."));
           const auto cur_weight =
               weight_data ? weight_data[cur_label] : static_cast<T>(1);
           out_data[index] = -x_data[i * sample_size + cur_label * map_size +
@@ -117,7 +117,7 @@ static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data,
         }
         PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
                           platform::errors::InvalidArgument(
-                              "label should nor be out of bounds."));
+                              "label should not be out of bounds."));
         const auto cur_weight =
             weight_data ? weight_data[cur_label] : static_cast<T>(1);
         total_weight_val += cur_weight;
diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc
index 057a7a38e3f40fdeb400418740dab825f532054c..aa39821051eed11c3aa02c4baabfef539d7d7692 100644
--- a/paddle/fluid/operators/p_norm_op.cc
+++ b/paddle/fluid/operators/p_norm_op.cc
@@ -25,34 +25,49 @@ class PnormOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(Tensor) A tensor of rank >= axis.");
     AddAttr<float>("porder",
-                   "The porder is the p order vector norm to calculate.")
+                   "(float, default 2) The porder is the p order vector norm "
+                   "to calculate. Available for porder=0, inf, -inf and any "
+                   "real number.")
         .SetDefault(2.0f);
     AddAttr<int>("axis",
-                 "The axis on which to apply normalization. If axis < 0, "
+                 "The axis on which to apply norm operation. If axis < 0, "
                  "the dimension to pnorm is rank(X) + axis. -1 is "
                  "the last dimension.")
         .SetDefault(-1);
     AddAttr<float>("epsilon",
-                   "(float, default 1e-10) The epsilon value is used "
+                   "(float, default 1e-12) The epsilon value is used "
                    "to avoid division by zero.")
         .SetDefault(1.0e-12f);
     AddAttr<bool>(
         "keepdim",
-        "(bool, default false) Whether to keep the dimensions as the input")
+        "(bool, default false) Whether to keep the dimensions as the input.")
         .SetDefault(false);
-    AddOutput(
-        "Out",
-        "(Tensor) Output tensor for the `(sum(x.pow(p)) + epsion).pow(1/p)`");
+    AddOutput("Out", "(Tensor) Output result tensor of p-norm");
     AddComment(R"DOC(
+Pnorm Operator.
+Given a tensor X, compute Lp-norm of X.
 
-Given a tensor, apply 2-normalization along the provided axis.
+When p = 0, defining $0^0 = 0$, the zero-norm of X is simply the number of non-zero elements of X.
+$$
+||X||_{0} = \lim_{p \rightarrow 0} \sum_i |x_i|^p
+$$
+
+When p = inf, the inf-norm of X is the maximum element of X.
+$$
+||X||_\infty = \max_i |x_i|
+$$
+
+When p = -inf, the negative-inf-norm of X is the minimum element of X.
+$$
+||X||_{-\infty} = \min_i |x_i|
+$$
 
+Otherwise, the p-norm of X follows the formula,
 $$
-pnorm = \(\sum_i {abs\(x_i\)^p}  \)^{1/p}
+||X||_{p} = (\sum_i |x_i|^p)^{1/p}
 $$
+where, $\sum_i $ is calculated along the `axis` dimension.
 
-where, $\sum_i{x_i^p}$ is calculated along the `axis` dimension.
-        
 )DOC");
   }
 };
@@ -63,31 +78,33 @@ class PnormOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "p_norm");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "p_norm");
-    auto porder = ctx->Attrs().Get<float>("porder");
-    PADDLE_ENFORCE_NE(porder, INFINITY,
-                      platform::errors::Unimplemented(
-                          "The input porder of p_norm is not support for "
-                          "porder == 0, INFINITY, -INFINITY now."));
-    PADDLE_ENFORCE_NE(porder, -INFINITY,
-                      platform::errors::Unimplemented(
-                          "The input porder of p_norm is not support for "
-                          "porder == 0, INFINITY, -INFINITY now."));
-    PADDLE_ENFORCE_GT(porder, 0.0f,
-                      platform::errors::InvalidArgument(
-                          "The input porder of p_norm is not support for "
-                          "porder <= 0, But received porder=%f.",
-                          porder));
-    auto xdim = ctx->GetInputDim("X");
+    auto x_dim = ctx->GetInputDim("X");
+    auto x_rank = x_dim.size();
     int axis = ctx->Attrs().Get<int>("axis");
     bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    if (axis < 0) axis = xdim.size() + axis;
+
+    PADDLE_ENFORCE_GE(axis, -x_rank,
+                      platform::errors::InvalidArgument(
+                          "Attr(axis) value should be in range [-R, R-1], R is "
+                          "the rank of Input(X). But received axis: %d, R: %d. "
+                          "Current Input(X)'s shape is=[%s].",
+                          axis, x_rank, x_dim));
+    PADDLE_ENFORCE_LT(axis, x_rank,
+                      platform::errors::InvalidArgument(
+                          "Attr(axis) value should be in range [-R, R-1], R is "
+                          "the rank of Input(X). But received axis: %d, R: %d. "
+                          "Current Input(X)'s shape is=[%s].",
+                          axis, x_rank, x_dim));
+
+    if (axis < 0) axis = x_dim.size() + axis;
     std::vector<int> reduce_dims;
-    for (int i = 0; i < xdim.size(); ++i) {
-      if (i != axis) reduce_dims.emplace_back(xdim[i]);
+    for (int i = 0; i < x_dim.size(); ++i) {
+      if (i != axis) reduce_dims.emplace_back(x_dim[i]);
     }
-    xdim[axis] = 1;
+    x_dim[axis] = 1;
+
     if (keepdim) {
-      ctx->SetOutputDim("Out", xdim);
+      ctx->SetOutputDim("Out", x_dim);
     } else {
       ctx->SetOutputDim("Out", framework::make_ddim(reduce_dims));
     }
diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu
index d9ac98ff880bcf42e0af5bb75b080464c5211671..63f2a1c56c12522bc8a029e392ff02f5a28b45df 100644
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -49,20 +49,70 @@ __global__ void Pnorm(const T* x, const int pre,
 
   for (int i = blockIdx.x; i < num; i += gridDim.x) {
     int base = (i / post) * post * axis_n + (i % post);
-
     T sum = 0.0;
-    __shared__ T norm;
     for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
       const T x_ij = x[base + j * post];
       sum += inline_pow(inline_abs(x_ij), porder_t);
     }
     T reduce_result = BlockReduce(temp_storage).Sum(sum);
+    if (threadIdx.x == 0) out_norm[i] = inline_pow(reduce_result, porder_inv);
+  }
+}
 
-    if (threadIdx.x == 0) {
-      norm = inline_pow(reduce_result, porder_inv);
-      out_norm[i] = norm;
+template <typename T, int BlockDim>
+__global__ void ZeorNorm(const T* x, const int pre,
+                         const int axis_n,  // dim in axis
+                         const int post, T* out_norm) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+    T sum = 0.0;
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      const T x_ij = x[base + j * post];
+      sum += static_cast<T>(x_ij != 0);
     }
-    __syncthreads();
+    T reduce_result = BlockReduce(temp_storage).Sum(sum);
+    if (threadIdx.x == 0) out_norm[i] = reduce_result;
+  }
+}
+
+template <typename T, int BlockDim>
+__global__ void InfNorm(const T* x, const int pre,
+                        const int axis_n,  // dim in axis
+                        const int post, T* out_norm) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+    T cur_max = inline_abs(x[base]);
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      T x_ij_abs = inline_abs(x[base + j * post]);
+      if (cur_max < x_ij_abs) cur_max = x_ij_abs;
+    }
+    T reduce_result = BlockReduce(temp_storage).Reduce(cur_max, cub::Max());
+    if (threadIdx.x == 0) out_norm[i] = reduce_result;
+  }
+}
+
+template <typename T, int BlockDim>
+__global__ void NegInfNorm(const T* x, const int pre,
+                           const int axis_n,  // dim in axis
+                           const int post, T* out_norm) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+    T cur_min = inline_abs(x[base]);
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      T x_ij_abs = inline_abs(x[base + j * post]);
+      if (cur_min > x_ij_abs) cur_min = x_ij_abs;
+    }
+    T reduce_result = BlockReduce(temp_storage).Reduce(cur_min, cub::Min());
+    if (threadIdx.x == 0) out_norm[i] = reduce_result;
   }
 }
 
@@ -89,8 +139,19 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
-    Pnorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
-                                                          porder, norm);
+    if (porder == 0) {
+      ZeorNorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
+                                                               norm);
+    } else if (porder == INFINITY) {
+      InfNorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
+                                                              norm);
+    } else if (porder == -INFINITY) {
+      NegInfNorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n,
+                                                                 post, norm);
+    } else {
+      Pnorm<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
+                                                            porder, norm);
+    }
   }
 };
 
@@ -112,7 +173,6 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad,
       pnorm_i = x_norm[i];
       yout_i = y_grad[i];
     }
-
     __syncthreads();
 
     for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
@@ -125,6 +185,33 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad,
   }
 }
 
+template <typename T, int BlockDim>
+__global__ void InfNormGradient(const T* x, const T* x_norm, const T* y_grad,
+                                const int pre, const int axis_n, const int post,
+                                T* x_grad) {
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    __shared__ T pnorm_i;
+    __shared__ T yout_i;
+    auto base = (i / post) * post * axis_n + (i % post);
+    if (threadIdx.x == 0) {
+      pnorm_i = x_norm[i];
+      yout_i = y_grad[i];
+    }
+    __syncthreads();
+
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      int index = base + j * post;
+      const T x_ij = inline_abs(x[index]);
+      if (x_ij == pnorm_i) {
+        x_grad[index] = inline_sign(x[index]) * yout_i;
+      } else {
+        x_grad[index] = static_cast<T>(0);
+      }
+    }
+  }
+}
+
 template <typename DeviceContext, typename T, typename AttrType = T>
 class PnormGradCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -153,8 +240,17 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
-    PnormGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x, x_norm, norm_dy, porder, pre, n, post, eps, dx);
+    if (porder == 0) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      set_zero(dev_ctx, out_dx, static_cast<T>(0));
+    } else if (porder == INFINITY || porder == -INFINITY) {
+      InfNormGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+          x, x_norm, norm_dy, pre, n, post, dx);
+    } else {
+      PnormGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+          x, x_norm, norm_dy, porder, pre, n, post, eps, dx);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/p_norm_op.h b/paddle/fluid/operators/p_norm_op.h
index c5bdfe352723b55f80376d6644922af5de099e90..7620d1421e897f1a62ddf3a6c6e725e5a0f38bf0 100644
--- a/paddle/fluid/operators/p_norm_op.h
+++ b/paddle/fluid/operators/p_norm_op.h
@@ -58,10 +58,20 @@ class PnormKernel : public framework::OpKernel<T> {
     auto x = x_e.reshape(shape);
     auto norm = norm_e.reshape(norm_shape);
 
+    // p=0 means number of non-zero elements of (x)
+    // p=inf means the maximum of |x|
+    // p=-inf means the minimum of |x|
+    // otherwise, Lp-norm = pow(sum(pow(|x|, p)), 1/p)
     Eigen::DSizes<int, 1> rdim(1);
-    auto xp = (x.abs()).pow(porder);
-    auto sum = xp.sum(rdim);
-    norm.device(*place) = sum.pow(1.0f / porder);
+    if (porder == 0) {
+      norm.device(*place) = (x != x.constant(0)).template cast<T>().sum(rdim);
+    } else if (porder == INFINITY) {
+      norm.device(*place) = x.abs().maximum(rdim);
+    } else if (porder == -INFINITY) {
+      norm.device(*place) = x.abs().minimum(rdim);
+    } else {
+      norm.device(*place) = x.abs().pow(porder).sum(rdim).pow(1.0f / porder);
+    }
   }
 };
 
@@ -102,10 +112,20 @@ class PnormGradKernel : public framework::OpKernel<T> {
     Eigen::DSizes<int, 1> rdim(1);
     Eigen::DSizes<int, 3> bcast(1, n, 1);
 
-    dx.device(*place) = (x.abs()).pow(porder - 1.0f);
-    dx.device(*place) =
-        dx / ((norm.broadcast(bcast)).pow(porder - 1.0f) + x.constant(eps));
-    dx.device(*place) = dx * norm_dy.broadcast(bcast) * x.sign();
+    if (porder == 0) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      set_zero(dev_ctx, out_dx, static_cast<T>(0));
+    } else if (porder == INFINITY || porder == -INFINITY) {
+      dx.device(*place) =
+          (x.abs() == norm.broadcast(bcast)).template cast<T>() * x.sign() *
+          norm_dy.broadcast(bcast);
+    } else {
+      dx.device(*place) =
+          (x.abs()).pow(porder - 1.0f) /
+          ((norm.broadcast(bcast)).pow(porder - 1.0f) + x.constant(eps));
+      dx.device(*place) = dx * norm_dy.broadcast(bcast) * x.sign();
+    }
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d41b823b6551647803ae5641f72955dbbc1eb62
--- /dev/null
+++ b/paddle/fluid/operators/pad3d_op.cc
@@ -0,0 +1,912 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                         const int in_height, const int in_width,
+                         const int out_depth, const int out_height,
+                         const int out_width, const int pad_front,
+                         const int pad_top, const int pad_left, const int out_d,
+                         const int out_h, const int out_w, const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+       in_h >= in_height || in_w >= in_width)
+          ? value
+          : in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                         const int in_depth, const int in_height,
+                         const int in_width, const int out_depth,
+                         const int out_height, const int out_width,
+                         const int pad_front, const int pad_top,
+                         const int pad_left, const int out_d, const int out_h,
+                         const int out_w, const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+      in_h >= in_height || in_w >= in_width) {
+    for (int c = 0; c < channels; ++c) {
+      out_data[out_index + c] = value;
+    }
+  } else {
+    const int in_index =
+        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+    for (int c = 0; c < channels; ++c) {
+      out_data[out_index + c] = in_data[in_index + c];
+    }
+  }
+}
+
+template <typename T>
+void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                           const int in_height, const int in_width,
+                           const int out_depth, const int out_height,
+                           const int out_width, const int pad_front,
+                           const int pad_top, const int pad_left,
+                           const int out_d, const int out_h, const int out_w,
+                           const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);                     // reflect by 0
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+  in_h = std::max(in_h, -in_h);                     // reflect by 0
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+  in_w = std::max(in_w, -in_w);                     // reflect by 0
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                           const int in_depth, const int in_height,
+                           const int in_width, const int out_depth,
+                           const int out_height, const int out_width,
+                           const int pad_front, const int pad_top,
+                           const int pad_left, const int out_d, const int out_h,
+                           const int out_w, const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
+  in_h = std::max(in_h, -in_h);
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);
+  in_w = std::max(in_w, -in_w);
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                             const int in_height, const int in_width,
+                             const int out_depth, const int out_height,
+                             const int out_width, const int pad_front,
+                             const int pad_top, const int pad_left,
+                             const int out_d, const int out_h, const int out_w,
+                             const T value) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                             const int in_depth, const int in_height,
+                             const int in_width, const int out_depth,
+                             const int out_height, const int out_width,
+                             const int pad_front, const int pad_top,
+                             const int pad_left, const int out_d,
+                             const int out_h, const int out_w, const T value) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
+                            const int in_height, const int in_width,
+                            const int out_depth, const int out_height,
+                            const int out_width, const int pad_front,
+                            const int pad_top, const int pad_left,
+                            const int out_d, const int out_h, const int out_w,
+                            const T value) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
+                            const int in_depth, const int in_height,
+                            const int in_width, const int out_depth,
+                            const int out_height, const int out_width,
+                            const int pad_front, const int pad_top,
+                            const int pad_left, const int out_d,
+                            const int out_h, const int out_w, const T value) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void Pad3DNCDHW(const T* in_data, const int num, const int channels,
+                const int in_depth, const int in_height, const int in_width,
+                const int out_depth, const int out_height, const int out_width,
+                const int pad_front, const int pad_top, const int pad_left,
+                T value, T* out_data,
+                void (*pad_func)(const T*, T*, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const T)) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_d = 0; out_d < out_depth; ++out_d) {
+        for (int out_h = 0; out_h < out_height; ++out_h) {
+          for (int out_w = 0; out_w < out_width; ++out_w) {
+            pad_func(in_data, out_data, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, out_d, out_h, out_w, value);
+          }
+        }
+      }
+      in_data += in_depth * in_height * in_width;
+      out_data += out_depth * out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad3DNDHWC(const T* in_data, const int num, const int channels,
+                const int in_depth, const int in_height, const int in_width,
+                const int out_depth, const int out_height, const int out_width,
+                const int pad_front, const int pad_top, const int pad_left,
+                T value, T* out_data,
+                void (*pad_func)(const T*, T*, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const int, const int, const int,
+                                 const int, const int, const T)) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_d = 0; out_d < out_depth; ++out_d) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          pad_func(in_data, out_data, channels, in_depth, in_height, in_width,
+                   out_depth, out_height, out_width, pad_front, pad_top,
+                   pad_left, out_d, out_h, out_w, value);
+        }
+      }
+    }
+    in_data += in_depth * in_height * in_width * channels;
+    out_data += out_depth * out_height * out_width * channels;
+  }
+}
+
+template <typename T>
+void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth,
+                         const int in_height, const int in_width,
+                         const int out_depth, const int out_height,
+                         const int out_width, const int pad_front,
+                         const int pad_top, const int pad_left, const int out_d,
+                         const int out_h, const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+        in_h >= in_height || in_w >= in_width)) {
+    d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] =
+        d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+  }
+}
+
+template <typename T>
+void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels,
+                         const int in_depth, const int in_height,
+                         const int in_width, const int out_depth,
+                         const int out_height, const int out_width,
+                         const int pad_front, const int pad_top,
+                         const int pad_left, const int out_d, const int out_h,
+                         const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+        in_h >= in_height || in_w >= in_width)) {
+    const int in_index =
+        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+    for (int c = 0; c < channels; ++c) {
+      d_in_data[in_index + c] = d_out_data[out_index + c];
+    }
+  }
+}
+
+template <typename T>
+void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data,
+                           const int in_depth, const int in_height,
+                           const int in_width, const int out_depth,
+                           const int out_height, const int out_width,
+                           const int pad_front, const int pad_top,
+                           const int pad_left, const int out_d, const int out_h,
+                           const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);                     // reflect by 0
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+  in_h = std::max(in_h, -in_h);                     // reflect by 0
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+  in_w = std::max(in_w, -in_w);                     // reflect by 0
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data,
+                           const int channels, const int in_depth,
+                           const int in_height, const int in_width,
+                           const int out_depth, const int out_height,
+                           const int out_width, const int pad_front,
+                           const int pad_top, const int pad_left,
+                           const int out_d, const int out_h, const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
+  in_h = std::max(in_h, -in_h);
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);
+  in_w = std::max(in_w, -in_w);
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data,
+                             const int in_depth, const int in_height,
+                             const int in_width, const int out_depth,
+                             const int out_height, const int out_width,
+                             const int pad_front, const int pad_top,
+                             const int pad_left, const int out_d,
+                             const int out_h, const int out_w) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data,
+                             const int channels, const int in_depth,
+                             const int in_height, const int in_width,
+                             const int out_depth, const int out_height,
+                             const int out_width, const int pad_front,
+                             const int pad_top, const int pad_left,
+                             const int out_d, const int out_h,
+                             const int out_w) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data,
+                            const int in_depth, const int in_height,
+                            const int in_width, const int out_depth,
+                            const int out_height, const int out_width,
+                            const int pad_front, const int pad_top,
+                            const int pad_left, const int out_d,
+                            const int out_h, const int out_w) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data,
+                            const int channels, const int in_depth,
+                            const int in_height, const int in_width,
+                            const int out_depth, const int out_height,
+                            const int out_width, const int pad_front,
+                            const int pad_top, const int pad_left,
+                            const int out_d, const int out_h, const int out_w) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels,
+                    const int in_depth, const int in_height, const int in_width,
+                    const int out_depth, const int out_height,
+                    const int out_width, const int pad_front, const int pad_top,
+                    const int pad_left, const T* d_out_data,
+                    void (*pad_func)(T*, const T*, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int)) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_d = 0; out_d < out_depth; ++out_d) {
+        for (int out_h = 0; out_h < out_height; ++out_h) {
+          for (int out_w = 0; out_w < out_width; ++out_w) {
+            pad_func(d_in_data, d_out_data, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, out_d, out_h, out_w);
+          }
+        }
+      }
+      d_in_data += in_depth * in_height * in_width;
+      d_out_data += out_depth * out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels,
+                    const int in_depth, const int in_height, const int in_width,
+                    const int out_depth, const int out_height,
+                    const int out_width, const int pad_front, const int pad_top,
+                    const int pad_left, const T* d_out_data,
+                    void (*pad_func)(T*, const T*, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int)) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_d = 0; out_d < out_depth; ++out_d) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          pad_func(d_in_data, d_out_data, channels, in_depth, in_height,
+                   in_width, out_depth, out_height, out_width, pad_front,
+                   pad_top, pad_left, out_d, out_h, out_w);
+        }
+      }
+    }
+    d_in_data += in_depth * in_height * in_width * channels;
+    d_out_data += out_depth * out_height * out_width * channels;
+  }
+}
+
+static inline std::vector<int> GetPaddings(
+    const framework::ExecutionContext& context) {
+  std::vector<int> paddings(6);
+  auto* paddings_t = context.Input<Tensor>("Paddings");
+  if (paddings_t) {
+    auto paddings_data = paddings_t->data<int>();
+    std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int));
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings.data());
+  }
+  return paddings;
+}
+
+template <typename T>
+class Pad3dCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    T value = static_cast<T>(context.Attr<float>("value"));
+
+    auto* x = context.Input<Tensor>("X");
+    auto in_dims = x->dims();
+    const T* in_data = x->data<T>();
+
+    auto* out = context.Output<Tensor>("Out");
+    if (data_format == "NCDHW") {
+      out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5],
+                   in_dims[3] + pads[2] + pads[3],
+                   in_dims[4] + pads[0] + pads[1]});
+    } else {
+      out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5],
+                   in_dims[2] + pads[2] + pads[3],
+                   in_dims[3] + pads[0] + pads[1], in_dims[4]});
+    }
+    auto out_dims = out->dims();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    int channels = in_dims[1];
+    int in_depth = in_dims[2];
+    int in_height = in_dims[3];
+    int in_width = in_dims[4];
+    int out_depth = out_dims[2];
+    int out_height = out_dims[3];
+    int out_width = out_dims[4];
+    if (data_format == "NDHWC") {
+      channels = in_dims[4];
+      in_depth = in_dims[1];
+      in_height = in_dims[2];
+      in_width = in_dims[3];
+      out_depth = out_dims[1];
+      out_height = out_dims[2];
+      out_width = out_dims[3];
+    }
+
+    if (mode == "reflect") {
+      PADDLE_ENFORCE_GT(in_depth, pads[4],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_front"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_front(%d).",
+                            in_depth, pads[4]));
+      PADDLE_ENFORCE_GT(in_depth, pads[5],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_back"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_back(%d).",
+                            in_depth, pads[5]));
+
+      PADDLE_ENFORCE_GT(in_height, pads[2],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_top"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_top(%d).",
+                            in_height, pads[2]));
+      PADDLE_ENFORCE_GT(in_height, pads[3],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_bottom"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_bottom(%d).",
+                            in_height, pads[3]));
+
+      PADDLE_ENFORCE_GT(in_width, pads[0],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_left"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_left(%d).",
+                            in_width, pads[0]));
+      PADDLE_ENFORCE_GT(in_width, pads[1],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_right"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_right(%d).",
+                            in_width, pads[1]));
+    }
+
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+    const int num = in_dims[0];
+    if (data_format == "NCDHW") {
+      std::map<std::string,
+               void (*)(const T*, T*, const int, const int, const int,
+                        const int, const int, const int, const int, const int,
+                        const int, const int, const int, const int, const T)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DFuncNCDHW;
+      func_map["replicate"] = ReplicatePad3DFuncNCDHW;
+      func_map["circular"] = CircularPad3DFuncNCDHW;
+      func_map["constant"] = ConstPad3DFuncNCDHW;
+      Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width,
+                 out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+                 value, out_data, func_map[mode]);
+    } else {
+      std::map<std::string, void (*)(const T*, T*, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const int,
+                                     const int, const int, const int, const T)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DFuncNDHWC;
+      func_map["replicate"] = ReplicatePad3DFuncNDHWC;
+      func_map["circular"] = CircularPad3DFuncNDHWC;
+      func_map["constant"] = ConstPad3DFuncNDHWC;
+      Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width,
+                 out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+                 value, out_data, func_map[mode]);
+    }
+  }
+};
+
+template <typename T>
+class Pad3dGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
+    auto d_in_dims = d_in->dims();
+    auto d_out_dims = d_out->dims();
+    const T* d_out_data = d_out->data<T>();
+    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(context.template device_context<platform::CPUDeviceContext>(),
+             d_in, static_cast<T>(0));
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+    const int num = d_in_dims[0];
+    if (data_format == "NCDHW") {
+      const int channels = d_in_dims[1];
+      const int in_depth = d_in_dims[2];
+      const int in_height = d_in_dims[3];
+      const int in_width = d_in_dims[4];
+      const int out_depth = d_out_dims[2];
+      const int out_height = d_out_dims[3];
+      const int out_width = d_out_dims[4];
+
+      std::map<std::string,
+               void (*)(T*, const T*, const int, const int, const int,
+                        const int, const int, const int, const int, const int,
+                        const int, const int, const int, const int)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DGradNCDHW;
+      func_map["replicate"] = ReplicatePad3DGradNCDHW;
+      func_map["circular"] = CircularPad3DGradNCDHW;
+      func_map["constant"] = ConstPad3DGradNCDHW;
+
+      Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, d_out_data, func_map[mode]);
+    } else {
+      const int channels = d_in_dims[4];
+      const int in_depth = d_in_dims[1];
+      const int in_height = d_in_dims[2];
+      const int in_width = d_in_dims[3];
+      const int out_depth = d_out_dims[1];
+      const int out_height = d_out_dims[2];
+      const int out_width = d_out_dims[3];
+
+      std::map<std::string,
+               void (*)(T*, const T*, const int, const int, const int,
+                        const int, const int, const int, const int, const int,
+                        const int, const int, const int, const int, const int)>
+          func_map;
+
+      func_map["reflect"] = ReflectPad3DGradNDHWC;
+      func_map["replicate"] = ReplicatePad3DGradNDHWC;
+      func_map["circular"] = CircularPad3DGradNDHWC;
+      func_map["constant"] = ConstPad3DGradNDHWC;
+
+      Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width,
+                     out_depth, out_height, out_width, pad_front, pad_top,
+                     pad_left, d_out_data, func_map[mode]);
+    }
+  }
+};
+
+class Pad3dOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d");
+
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dim.size(), 5,
+                      platform::errors::InvalidArgument(
+                          "The size of Input(X)'s dimension should be equal to "
+                          "5, but received %d. ",
+                          x_dim.size()));
+
+    std::vector<int64_t> out_dims(x_dim.size());
+    auto data_format = ctx->Attrs().Get<std::string>("data_format");
+    out_dims[0] = x_dim[0];
+    if (ctx->HasInput("Paddings")) {
+      auto paddings_dim = ctx->GetInputDim("Paddings");
+      PADDLE_ENFORCE_EQ(paddings_dim.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "Size of Input(Paddings)'s dimension should be "
+                            "equal to 1, but received %d.",
+                            paddings_dim.size()));
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(paddings_dim[0], 6,
+                          platform::errors::InvalidArgument(
+                              "Shape of Input(Paddings) should be equal to "
+                              "[6], but received [%d].",
+                              paddings_dim[0]));
+      }
+      out_dims[1] = x_dim[1];
+      out_dims[2] = x_dim[2];
+      out_dims[3] = x_dim[3];
+    } else {
+      auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+      PADDLE_ENFORCE_EQ(
+          paddings.size(), 6,
+          platform::errors::InvalidArgument(
+              "Size of paddings should be equal to 4, but received %d.",
+              static_cast<int>(paddings.size())));
+      if (data_format == "NCDHW") {
+        out_dims[1] = x_dim[1];  // channel
+        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
+                          ? x_dim[2]
+                          : (x_dim[2] + paddings[4] + paddings[5]);  // depth
+
+        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
+                          ? x_dim[3]
+                          : (x_dim[3] + paddings[2] + paddings[3]);  // height
+
+        out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0))
+                          ? x_dim[4]
+                          : (x_dim[4] + paddings[0] + paddings[1]);  // width
+      } else {                                                       // NDHWC
+        out_dims[4] = x_dim[4];                                      // channel
+
+        out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0))
+                          ? x_dim[1]
+                          : (x_dim[1] + paddings[4] + paddings[5]);  // depth
+        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
+                          ? x_dim[2]
+                          : (x_dim[2] + paddings[2] + paddings[3]);  // height
+        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
+                          ? x_dim[3]
+                          : (x_dim[3] + paddings[0] + paddings[1]);  // width
+      }
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class Pad3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input of pad3d op. "
+             "The input should be a 5-D tensor with formate NCDHW or NDHWC.");
+    AddOutput("Out",
+              "The output of pad3d op. "
+              "A tensor with the same shape as X.");
+    AddInput("Paddings",
+             "A 1-D tensor to describe the padding rules."
+             "paddings=[0, 1, 2, 3, 4, 5] means "
+             "padding 0 column to left, 1 column to right, "
+             "2 row to top, 3 row to bottom, 4 depth to front "
+             "and 5 depth to back. Size of paddings must be 6.")
+        .AsDispensable();
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules."
+        "paddings=[0, 1, 2, 3, 4, 5] means "
+        "padding 0 column to left, 1 column to right, "
+        "2 row to top, 3 row to bottom, 4 depth to front "
+        "and 5 depth to back. Size of paddings must be 6.");
+    AddAttr<float>("value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas in constant mode.")
+        .SetDefault(0.0f);
+    AddAttr<std::string>(
+        "mode",
+        "(string, default constant) "
+        "Four modes: constant(default), reflect, replicate, circular.")
+        .SetDefault("constant");
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCDHW) Only used in "
+        "An optional string from: \"NDHWC\", \"NCDHW\". "
+        "Defaults to \"NDHWC\". Specify the data format of the input data.")
+        .SetDefault("NCDHW");
+    AddComment(R"DOC(
+Pad3d Operator.
+Pad 3-d images according to 'paddings' and 'mode'. 
+If mode is 'reflect', paddings[0] and paddings[1] must be no greater
+than width-1. The height and depth dimension have the same condition.
+
+Given that X is a channel of image from input:
+
+X = [[[[[1, 2, 3],
+     [4, 5, 6]]]]]
+
+Case 0:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'constant'
+pad_value = 0
+
+Out = [[[[[0. 0. 0. 0. 0. 0. 0.]
+          [0. 0. 1. 2. 3. 0. 0.]
+          [0. 0. 4. 5. 6. 0. 0.]
+          [0. 0. 0. 0. 0. 0. 0.]]]]]
+
+Case 1:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'reflect'
+
+Out = [[[[[6. 5. 4. 5. 6. 5. 4.]
+          [3. 2. 1. 2. 3. 2. 1.]
+          [6. 5. 4. 5. 6. 5. 4.]
+          [3. 2. 1. 2. 3. 2. 1.]]]]]
+
+Case 2:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'replicate'
+
+Out = [[[[[1. 1. 1. 2. 3. 3. 3.]
+          [1. 1. 1. 2. 3. 3. 3.]
+          [4. 4. 4. 5. 6. 6. 6.]
+          [4. 4. 4. 5. 6. 6. 6.]]]]]
+
+Case 3:
+
+paddings = [2, 2, 1, 1, 0, 0],
+mode = 'circular'
+
+Out = [[[[[5. 6. 4. 5. 6. 4. 5.]
+          [2. 3. 1. 2. 3. 1. 2.]
+          [5. 6. 4. 5. 6. 4. 5.]
+          [2. 3. 1. 2. 3. 1. 2.]]]]]
+
+)DOC");
+  }
+};
+
+class Pad3dOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d@Grad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "Pad3d@Grad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class Pad3dOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> bind) const override {
+    bind->SetInput("X", this->Input("X"));
+    if (this->HasInput("Paddings")) {
+      bind->SetInput("Paddings", this->Input("Paddings"));
+    }
+    bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    bind->SetAttrMap(this->Attrs());
+    bind->SetType("pad3d_grad");
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker,
+                  ops::Pad3dOpGradMaker<paddle::framework::OpDesc>,
+                  ops::Pad3dOpGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad,
+                  ops::Pad3dOpGradNoNeedBufferVarsInferer);
+REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel<float>,
+                       ops::Pad3dCPUKernel<double>, ops::Pad3dCPUKernel<int>,
+                       ops::Pad3dCPUKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel<float>,
+                       ops::Pad3dGradCPUKernel<double>);
diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..672a75389ccf18d11e508ca94d45128b2e7b56b7
--- /dev/null
+++ b/paddle/fluid/operators/pad3d_op.cu
@@ -0,0 +1,788 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+using framework::Tensor;
+
+template <typename T>
+__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data,
+                                const int num, const int channels,
+                                const int in_depth, const int in_height,
+                                const int in_width, const int out_depth,
+                                const int out_height, const int out_width,
+                                const int pad_front, const int pad_top,
+                                const int pad_left, T value, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    out_data[index] =
+        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+         in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[nc * in_depth * in_height * in_width +
+                      in_d * in_height * in_width + in_h * in_width + in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data,
+                                const int num, const int channels,
+                                const int in_depth, const int in_height,
+                                const int in_width, const int out_depth,
+                                const int out_height, const int out_width,
+                                const int pad_front, const int pad_top,
+                                const int pad_left, T value, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+    const int in_d = out_d - pad_front;
+    const int in_h = out_h - pad_top;
+    const int in_w = out_w - pad_left;
+
+    out_data[index] =
+        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+         in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[n * in_depth * in_height * in_width * channels +
+                      in_d * in_height * in_width * channels +
+                      in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data,
+                                  const int num, const int channels,
+                                  const int in_depth, const int in_height,
+                                  const int in_width, const int out_depth,
+                                  const int out_height, const int out_width,
+                                  const int pad_front, const int pad_top,
+                                  const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);                     // reflect by 0
+    in_d = min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+    in_h = max(in_h, -in_h);                     // reflect by 0
+    in_h = min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+    in_w = max(in_w, -in_w);                     // reflect by 0
+    in_w = min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data,
+                                  const int num, const int channels,
+                                  const int in_depth, const int in_height,
+                                  const int in_width, const int out_depth,
+                                  const int out_height, const int out_width,
+                                  const int pad_front, const int pad_top,
+                                  const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_d = min(in_d, 2 * in_depth - in_d - 2);
+    in_h = max(in_h, -in_h);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = max(in_w, -in_w);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data,
+                                   const int num, const int channels,
+                                   const int in_depth, const int in_height,
+                                   const int in_width, const int out_depth,
+                                   const int out_height, const int out_width,
+                                   const int pad_front, const int pad_top,
+                                   const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data,
+                                   const int num, const int channels,
+                                   const int in_depth, const int in_height,
+                                   const int in_width, const int out_depth,
+                                   const int out_height, const int out_width,
+                                   const int pad_front, const int pad_top,
+                                   const int pad_left, T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
+    const int in_w = in_index % in_width;
+
+    int nc = in_index / in_width;
+    const int in_h = nc % in_height;
+
+    nc /= in_height;
+    const int in_d = nc % in_depth;
+
+    nc /= in_depth;
+
+    const int out_d = in_d + pad_front;
+    const int out_h = in_h + pad_top;
+    const int out_w = in_w + pad_left;
+    d_in_data[in_index] =
+        d_out_data[nc * out_depth * out_height * out_width +
+                   out_d * out_height * out_width + out_h * out_width + out_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data,
+                                    const int num, const int channels,
+                                    const int in_depth, const int in_height,
+                                    const int in_width, const int out_depth,
+                                    const int out_height, const int out_width,
+                                    const int pad_front, const int pad_top,
+                                    const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
+    const int c = in_index % channels;
+    int n = in_index / channels;
+
+    const int in_w = n % in_width;
+    n /= in_width;
+
+    const int in_h = n % in_height;
+    n /= in_height;
+
+    const int in_d = n % in_depth;
+    n /= in_depth;
+
+    const int out_d = in_d + pad_front;
+    const int out_h = in_h + pad_top;
+    const int out_w = in_w + pad_left;
+
+    d_in_data[in_index] =
+        d_out_data[n * out_depth * out_height * out_width * channels +
+                   out_d * out_height * out_width * channels +
+                   out_h * out_width * channels + out_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data,
+                                      const int num, const int channels,
+                                      const int in_depth, const int in_height,
+                                      const int in_width, const int out_depth,
+                                      const int out_height, const int out_width,
+                                      const int pad_front, const int pad_top,
+                                      const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+
+    in_d = min(in_d, 2 * in_depth - in_d - 2);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+
+    platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data,
+                                      const int num, const int channels,
+                                      const int in_depth, const int in_height,
+                                      const int in_width, const int out_depth,
+                                      const int out_height, const int out_width,
+                                      const int pad_front, const int pad_top,
+                                      const int pad_left, const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+
+    in_d = min(in_d, in_depth * 2 - in_d - 2);
+    in_h = min(in_h, in_height * 2 - in_h - 2);
+    in_w = min(in_w, in_width * 2 - in_w - 2);
+    platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReplicateNCDHW(
+    const int out_size, T* d_in_data, const int num, const int channels,
+    const int in_depth, const int in_height, const int in_width,
+    const int out_depth, const int out_height, const int out_width,
+    const int pad_front, const int pad_top, const int pad_left,
+    const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReplicateNDHWC(
+    const int out_size, T* d_in_data, const int num, const int channels,
+    const int in_depth, const int in_height, const int in_width,
+    const int out_depth, const int out_height, const int out_width,
+    const int pad_front, const int pad_top, const int pad_left,
+    const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data,
+                                       const int num, const int channels,
+                                       const int in_depth, const int in_height,
+                                       const int in_width, const int out_depth,
+                                       const int out_height,
+                                       const int out_width, const int pad_front,
+                                       const int pad_top, const int pad_left,
+                                       const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data,
+                                       const int num, const int channels,
+                                       const int in_depth, const int in_height,
+                                       const int in_width, const int out_depth,
+                                       const int out_height,
+                                       const int out_width, const int pad_front,
+                                       const int pad_top, const int pad_left,
+                                       const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+static inline std::vector<int> GetPaddings(
+    const framework::ExecutionContext& context) {
+  std::vector<int> paddings(6);
+  auto* paddings_data = context.Input<Tensor>("Paddings");
+  if (paddings_data) {
+    Tensor pads;
+    framework::TensorCopySync(*paddings_data, platform::CPUPlace(), &pads);
+    auto pads_data = pads.data<int>();
+    std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int));
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings.data());
+  }
+  return paddings;
+}
+
+template <typename T>
+class Pad3dCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    T value = static_cast<T>(context.Attr<float>("value"));
+
+    auto* x = context.Input<Tensor>("X");
+    auto in_dims = x->dims();
+    const T* in_data = x->data<T>();
+    auto* out = context.Output<Tensor>("Out");
+    auto out_dims = out->dims();
+    if (data_format == "NCDHW") {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1];
+      out_dims[2] = in_dims[2] + pads[4] + pads[5];
+      out_dims[3] = in_dims[3] + pads[2] + pads[3];
+      out_dims[4] = in_dims[4] + pads[0] + pads[1];
+    } else {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1] + pads[4] + pads[5];
+      out_dims[2] = in_dims[2] + pads[2] + pads[3];
+      out_dims[3] = in_dims[3] + pads[0] + pads[1];
+      out_dims[4] = in_dims[4];
+    }
+    T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
+
+    int channels = in_dims[1];
+    int in_depth = in_dims[2];
+    int in_height = in_dims[3];
+    int in_width = in_dims[4];
+    int out_depth = out_dims[2];
+    int out_height = out_dims[3];
+    int out_width = out_dims[4];
+    if (data_format == "NDHWC") {
+      channels = in_dims[4];
+      in_depth = in_dims[1];
+      in_height = in_dims[2];
+      in_width = in_dims[3];
+      out_depth = out_dims[1];
+      out_height = out_dims[2];
+      out_width = out_dims[3];
+    }
+
+    if (mode == "reflect") {
+      PADDLE_ENFORCE_GT(in_depth, pads[4],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_front"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_front(%d).",
+                            in_depth, pads[4]));
+      PADDLE_ENFORCE_GT(in_depth, pads[5],
+                        platform::errors::InvalidArgument(
+                            "The depth of Input(X)'s dimension should be "
+                            "greater than pad_back"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_back(%d).",
+                            in_depth, pads[5]));
+
+      PADDLE_ENFORCE_GT(in_height, pads[2],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_top"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_top(%d).",
+                            in_height, pads[2]));
+      PADDLE_ENFORCE_GT(in_height, pads[3],
+                        platform::errors::InvalidArgument(
+                            "The height of Input(X)'s dimension should be "
+                            "greater than pad_bottom"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_bottom(%d).",
+                            in_height, pads[3]));
+
+      PADDLE_ENFORCE_GT(in_width, pads[0],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_left"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_left(%d).",
+                            in_width, pads[0]));
+      PADDLE_ENFORCE_GT(in_width, pads[1],
+                        platform::errors::InvalidArgument(
+                            "The width of Input(X)'s dimension should be "
+                            "greater than pad_right"
+                            " in reflect mode"
+                            ", but received depth(%d) and pad_right(%d).",
+                            in_width, pads[1]));
+    }
+
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+    const int num = in_dims[0];
+
+    auto stream = context.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    const int out_size = out->numel();
+    int grid = (out_size + block - 1) / block;
+
+    if (data_format == "NCDHW") {
+      if (mode == "reflect") {
+        Pad3DReflectNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "replicate") {
+        Pad3DReplicateNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "circular") {
+        Pad3DCircularNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else {
+        Pad3DConstNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            value, out_data);
+      }
+    } else {
+      if (mode == "reflect") {
+        Pad3DReflectNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "replicate") {
+        Pad3DReplicateNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else if (mode == "circular") {
+        Pad3DCircularNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            out_data);
+      } else {
+        Pad3DConstNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            value, out_data);
+      }
+    }
+  }
+};
+
+template <typename T>
+class Pad3dGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    std::vector<int> pads = GetPaddings(context);
+    auto mode = context.Attr<std::string>("mode");
+    auto data_format = context.Attr<std::string>("data_format");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
+    auto d_in_dims = d_in->dims();
+    auto d_out_dims = d_out->dims();
+    const T* d_out_data = d_out->data<T>();
+    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(context.template device_context<platform::CUDADeviceContext>(),
+             d_in, static_cast<T>(0));
+
+    const int pad_left = pads[0];
+    const int pad_top = pads[2];
+    const int pad_front = pads[4];
+
+    const int num = d_in_dims[0];
+
+    auto stream = context.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    const int out_size = d_out->numel();
+    const int in_size = d_in->numel();
+    int grid = (out_size + block - 1) / block;
+
+    if (data_format == "NCDHW") {
+      const int channels = d_in_dims[1];
+      const int in_depth = d_in_dims[2];
+      const int in_height = d_in_dims[3];
+      const int in_width = d_in_dims[4];
+      const int out_depth = d_out_dims[2];
+      const int out_height = d_out_dims[3];
+      const int out_width = d_out_dims[4];
+
+      if (mode == "reflect") {
+        Pad3DGradReflectNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "replicate") {
+        Pad3DGradReplicateNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "circular") {
+        Pad3DGradCircularNCDHW<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else {
+        grid = (in_size + block - 1) / block;
+        Pad3DGradConstNCDHW<T><<<grid, block, 0, stream>>>(
+            in_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      }
+    } else {
+      const int channels = d_in_dims[4];
+      const int in_depth = d_in_dims[1];
+      const int in_height = d_in_dims[2];
+      const int in_width = d_in_dims[3];
+      const int out_depth = d_out_dims[1];
+      const int out_height = d_out_dims[2];
+      const int out_width = d_out_dims[3];
+      if (mode == "reflect") {
+        Pad3DGradReflectNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "replicate") {
+        Pad3DGradReplicateNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else if (mode == "circular") {
+        Pad3DGradCircularNDHWC<T><<<grid, block, 0, stream>>>(
+            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      } else {
+        grid = (in_size + block - 1) / block;
+        Pad3DGradConstNDHWC<T><<<grid, block, 0, stream>>>(
+            in_size, d_in_data, num, channels, in_depth, in_height, in_width,
+            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
+            d_out_data);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel<plat::float16>,
+                        ops::Pad3dCUDAKernel<float>,
+                        ops::Pad3dCUDAKernel<double>, ops::Pad3dCUDAKernel<int>,
+                        ops::Pad3dCUDAKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel<plat::float16>,
+                        ops::Pad3dGradCUDAKernel<float>,
+                        ops::Pad3dGradCUDAKernel<double>);
diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc
index 1ed7988dcfcc0831156c09a72e958852f3d45fb5..70d232ad6a51e21b863974e70920eb2d9da895e6 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
@@ -28,25 +28,44 @@ class PixelShuffleOp : public framework::OperatorWithKernel {
                           "Output(Out) of PixelShuffleOp should not be null."));
 
     auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "Input should be a 4-D tensor of format [N, C, H, W], but got %u.",
-            input_dims.size()));
+    PADDLE_ENFORCE_EQ(input_dims.size(), 4,
+                      platform::errors::InvalidArgument(
+                          "Input should be a 4-D tensor of format [N, C, H, W] "
+                          "or [N, H, W, C], but got %u.",
+                          input_dims.size()));
 
     auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
 
-    PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), 0,
-                      platform::errors::InvalidArgument(
-                          "The square of upscale_factor[%u] should divide the "
-                          "number of channel[%u]",
-                          input_dims[1], upscale_factor * upscale_factor));
-
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+    const bool channel_last = (data_format == "NHWC");
+
+    if (!channel_last) {
+      PADDLE_ENFORCE_EQ(
+          input_dims[1] % (upscale_factor * upscale_factor), 0,
+          platform::errors::InvalidArgument(
+              "The square of upscale_factor[%u] should divide the "
+              "number of channel[%u]",
+              input_dims[1], upscale_factor * upscale_factor));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          input_dims[3] % (upscale_factor * upscale_factor), 0,
+          platform::errors::InvalidArgument(
+              "The square of upscale_factor[%u] should divide the "
+              "number of channel[%u]",
+              input_dims[3], upscale_factor * upscale_factor));
+    }
     auto output_dims = input_dims;
     output_dims[0] = input_dims[0];
-    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
-    output_dims[2] = input_dims[2] * upscale_factor;
-    output_dims[3] = input_dims[3] * upscale_factor;
+    if (!channel_last) {
+      output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
+      output_dims[2] = input_dims[2] * upscale_factor;
+      output_dims[3] = input_dims[3] * upscale_factor;
+    } else {
+      output_dims[1] = input_dims[1] * upscale_factor;
+      output_dims[2] = input_dims[2] * upscale_factor;
+      output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor);
+    }
     ctx->SetOutputDim("Out", output_dims);
   }
 };
@@ -54,14 +73,14 @@ class PixelShuffleOp : public framework::OperatorWithKernel {
 class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput(
-        "X",
-        "(Tensor, default Tensor<float>), "
-        "the input feature data of PixelShuffleOp, the layout is [N C H W].");
-    AddOutput(
-        "Out",
-        "(Tensor, default Tensor<float>), the output of "
-        "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
+    AddInput("X",
+             "(Tensor, default Tensor<float>), "
+             "the input feature data of PixelShuffleOp, the layout is [N, C, "
+             "H, W] or [N, H, W, C].");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), the output of "
+              "PixelShuffleOp. The layout is [N, C/factor^2, H*factor, "
+              "W*factor] or [N, H*factor, W*factor, C/factor^2].");
     AddAttr<int>("upscale_factor",
                  "the factor to increase spatial resolution by.")
         .SetDefault(1)
@@ -70,6 +89,11 @@ class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
                             platform::errors::InvalidArgument(
                                 "upscale_factor should be larger than 0."));
         });
+    AddAttr<std::string>(
+        "data_format",
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\", Specify the data format of the input data.")
+        .SetDefault("NCHW");
 
     AddComment(R"DOC(
 		Pixel Shuffle operator
@@ -114,19 +138,30 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel {
         platform::errors::NotFound("Output(X@Grad) should not be null"));
 
     auto do_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(
-        do_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "Input should be a 4-D tensor of format [N, C, H, W], but got %u.",
-            do_dims.size()));
+    PADDLE_ENFORCE_EQ(do_dims.size(), 4,
+                      platform::errors::InvalidArgument(
+                          "Input should be a 4-D tensor of format [N, C, H, W] "
+                          "or [N, H, W, C], but got %u.",
+                          do_dims.size()));
 
     auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
 
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+    const bool channel_last = (data_format == "NHWC");
+
     auto dx_dims = do_dims;
     dx_dims[0] = do_dims[0];
-    dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
-    dx_dims[2] = do_dims[2] / upscale_factor;
-    dx_dims[3] = do_dims[3] / upscale_factor;
+
+    if (!channel_last) {
+      dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
+      dx_dims[2] = do_dims[2] / upscale_factor;
+      dx_dims[3] = do_dims[3] / upscale_factor;
+    } else {
+      dx_dims[1] = do_dims[1] / upscale_factor;
+      dx_dims[2] = do_dims[2] / upscale_factor;
+      dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor);
+    }
     ctx->SetOutputDim(framework::GradVarName("X"), dx_dims);
   }
 };
diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h
index 1ae1c7e9d50cb9d701fd0e79337a1906f2f5d545..b2a0db0f838d5dcc3fed2ed9838f1c43240ce0e7 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.h
+++ b/paddle/fluid/operators/pixel_shuffle_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -24,23 +25,33 @@ class PixelShuffleOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* out = ctx.Output<framework::Tensor>("Out");
+
     out->mutable_data<T>(ctx.GetPlace());
 
     int factor = ctx.Attr<int>("upscale_factor");
 
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    bool channel_last = (data_format == "NHWC");
+
     auto in_dims = in->dims();
     auto o_dims = out->dims();
 
     framework::Tensor t;
     t.ShareDataWith(*in);
-    t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
-
+    if (!channel_last) {
+      t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
+    } else {
+      t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor});
+    }
     std::vector<int> axis = {0, 1, 4, 2, 5, 3};
 
     framework::Tensor o;
     o.ShareDataWith(*out);
-    o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
-
+    if (!channel_last) {
+      o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
+    } else {
+      o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]});
+    }
     math::Transpose<DeviceContext, T, 6> trans;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     trans(dev_ctx, t, &o, axis);
@@ -58,19 +69,32 @@ class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
 
     int factor = ctx.Attr<int>("upscale_factor");
 
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    bool channel_last = (data_format == "NHWC");
+
     auto do_dims = dout->dims();
     auto dx_dims = dx->dims();
 
     framework::Tensor t;
     t.ShareDataWith(*dout);
-    t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
-
+    if (!channel_last) {
+      t.Resize(
+          {do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
+    } else {
+      t.Resize(
+          {do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]});
+    }
     std::vector<int> axis = {0, 1, 3, 5, 2, 4};
 
     framework::Tensor o;
     o.ShareDataWith(*dx);
-    o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
-
+    if (!channel_last) {
+      o.Resize(
+          {do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
+    } else {
+      o.Resize(
+          {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor});
+    }
     math::Transpose<DeviceContext, T, 6> trans;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     trans(dev_ctx, t, &o, axis);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 8ff3192ca24e193dfbddebdd1ce79ce91f08dbb2..9900120e6c590f5d0c454fdba3ee4e936c2c409b 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -306,12 +306,16 @@ void Pool2dOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool) Only used in mkldnn kernel. Default False")
       .SetDefault(false);
-  AddAttr<bool>("use_quantizer",
-                "(bool) "
-                "Set to true for operators that should be quantized and use "
-                "int8 kernel. "
-                "Only used on CPU. Default False")
+  AddAttr<bool>(
+      "use_quantizer",
+      "(bool, default false) "
+      "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
       .SetDefault(false);
+  AddAttr<std::string>(
+      "mkldnn_data_type",
+      "(string, default \"float32\"). Data type of mkldnn kernel")
+      .SetDefault("float32")
+      .InEnum({"float32", "int8", "bfloat16"});
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 354e5c60a6b9ed80f0f8c44439294bfa2731a423..7749903e5f36f1d93f7e111da4587d6828d445a4 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable,
       out->set_lod(py_out_tensor->lod());
       out->ShareDataWith(*py_out_tensor);
     } catch (py::cast_error &) {
-      PADDLE_THROW("The %d-th output must be LoDTensor", i);
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The %d-th output must be LoDTensor.", i));
     }
   }
 }
diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc
index 9f6df3f32b7463b30804555cbce4d4ee8f03a989..662fe3bcb3b3b2d26afaef0c9388dda329aea645 100644
--- a/paddle/fluid/operators/randint_op.cc
+++ b/paddle/fluid/operators/randint_op.cc
@@ -14,6 +14,8 @@
 
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
@@ -37,20 +39,30 @@ class CPURandintKernel : public framework::OpKernel<T> {
         new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
       }
     }
-
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     if (!new_shape.empty()) out->Resize(framework::make_ddim(new_shape));
     T* data = out->mutable_data<T>(ctx.GetPlace());
     int64_t size = out->numel();
-    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
+
     std::uniform_int_distribution<T> dist(ctx.Attr<int>("low"),
                                           ctx.Attr<int>("high") - 1);
-    for (int64_t i = 0; i < size; ++i) data[i] = dist(engine);
+
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) data[i] = dist(gen_engine);
+    } else {
+      unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h
index 64ef1c771423f2d820c73df8ed9ff25834f07875..0eb028ad806848a559ba51b9c950d324a598a851 100644
--- a/paddle/fluid/operators/randperm_op.h
+++ b/paddle/fluid/operators/randperm_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <ctime>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/place.h"
@@ -31,11 +32,17 @@ static inline void random_permate(T* data_ptr, int num, unsigned int seed) {
   for (int i = 0; i < num; ++i) {
     data_ptr[i] = static_cast<T>(i);
   }
-  if (seed == 0) {
-    seed = std::random_device()();
+  if (framework::Generator::GetInstance()->is_init_py) {
+    std::shuffle(data_ptr, data_ptr + num,
+                 framework::Generator::GetInstance()->GetCPUEngine());
+
+  } else {
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    std::srand(seed);
+    std::random_shuffle(data_ptr, data_ptr + num);
   }
-  std::srand(seed);
-  std::random_shuffle(data_ptr, data_ptr + num);
 }
 
 template <typename DeviceContext, typename T>
@@ -51,6 +58,7 @@ class RandpermKernel : public framework::OpKernel<T> {
     if (platform::is_cpu_place(ctx.GetPlace())) {
       T* out_data = out_tensor->mutable_data<T>(platform::CPUPlace());
       random_permate<T>(out_data, n, seed);
+
     } else {
       framework::Tensor tmp_tensor;
       tmp_tensor.Resize(framework::make_ddim({n}));
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index e0bcab1fb547afd6250e73c309cd61d343e631ff..f13b0d800bdc7fea72010069b3f36ebe1e04488a 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -36,15 +36,30 @@ BufferedReader::~BufferedReader() {
 
 BufferedReader::BufferedReader(
     const std::shared_ptr<framework::ReaderBase> &reader,
-    const platform::Place &place, size_t buffer_size)
+    const platform::Place &place, size_t buffer_size, bool pin_memory)
     : framework::DecoratedReader(reader),
       thread_pool_(1),
       place_(place),
-      buffer_size_(buffer_size) {
+      buffer_size_(buffer_size),
+      pin_memory_(pin_memory) {
   VLOG(1) << "BufferedReader";
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_) && !pin_memory) {
+    int dev_idx = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
+    compute_stream_ =
+        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
+                                             .Get(place_)))
+            ->stream();
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
+      event = platform::CudaEventResourcePool::Instance().New(dev_idx);
+    }
+    stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx);
+  }
+#endif
   is_same_place_ = false;
   cpu_buffer_.resize(buffer_size);
-  cuda_pinned_buffer_.resize(buffer_size);
+  cuda_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
 }
 
@@ -65,47 +80,103 @@ void BufferedReader::ReadAsync(size_t i) {
 
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
-      // NOTE: [Copy processing of different input devices]
-      // We may accept input tensor in three different devices:
-      //   - CPUPlace
-      //   - CUDAPinnedPlace
-      //   - CUDAPlace
-      // CUDA Stream Synchronizing is slow, in order to avoid Synchronizing
-      // in BufferedReader thread, we do data copy as follows:
-      //   - If src Tensor on CPU memory, we copy it to CUDAPinned memory
-      //   - IF src Tensor on CUDAPinned memory, we use it directly
-      //   - IF src Tensor on CUDA memory, we use it directly
-      platform::CUDAPinnedPlace cuda_pinned_place;
-      TensorVec &cuda_pinned = cuda_pinned_buffer_[i];
-      if (cuda_pinned.empty()) {
-        cuda_pinned.resize(cpu.size());
+      TensorVec &cuda = cuda_buffer_[i];
+      if (cuda.empty()) {
+        cuda.resize(cpu.size());
       } else {
         PADDLE_ENFORCE_EQ(
-            cuda_pinned.size(), cpu.size(),
+            cuda.size(), cpu.size(),
             platform::errors::InvalidArgument(
                 "Input tensor number on GPU and CPU devices are not matched."));
       }
+      if (pin_memory_) {
+        // NOTE: [Copy processing of different input devices]
+        // We may accept input tensor in three different devices:
+        //   - CPUPlace
+        //   - CUDAPinnedPlace
+        //   - CUDAPlace
+        // CUDA Stream Synchronizing is slow, in order to avoid Synchronizing
+        // in BufferedReader thread, we do data copy as follows:
+        //   - If src Tensor on CPU memory, we copy it to CUDAPinned memory
+        //   - IF src Tensor on CUDAPinned memory, we use it directly
+        //   - IF src Tensor on CUDA memory, we use it directly
+        platform::CUDAPinnedPlace cuda_pinned_place;
+        std::vector<void *> cuda_pinned_ptrs;
+        cuda_pinned_ptrs.reserve(cpu.size());
+        platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+        for (size_t i = 0; i < cpu.size(); ++i) {
+          if (platform::is_cpu_place(cpu[i].place())) {
+            cuda[i].Resize(cpu[i].dims());
+            cuda[i].set_layout(cpu[i].layout());
+            cuda_pinned_ptrs.emplace_back(
+                cuda[i].mutable_data(cuda_pinned_place, cpu[i].type()));
+            auto size =
+                cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+
+            memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i],
+                         BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()),
+                         cpu[i].data<void>(), size);
+            cuda[i].set_lod(cpu[i].lod());
+          } else {
+            // we set same place flag & use cpu[i] directly
+            is_same_place_ = true;
+          }
+        }
+      } else {
+        // NOTE(liangdun): using async copy instead of TensorCopySync
+        // TensorCopySync would block other stream, because TensorCopySync
+        // issues the copying command to the default stream, it will make two
+        // commands from different streams cannot run concurrently.
+        std::vector<void *> gpu_ptrs;
+        gpu_ptrs.reserve(cpu.size());
+        for (size_t i = 0; i < cpu.size(); ++i) {
+          cuda[i].Resize(cpu[i].dims());
+          cuda[i].set_layout(cpu[i].layout());
+          gpu_ptrs.emplace_back(cuda[i].mutable_data(place_, cpu[i].type()));
+        }
 
-      std::vector<void *> cuda_pinned_ptrs;
-      cuda_pinned_ptrs.reserve(cpu.size());
-      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
-      for (size_t i = 0; i < cpu.size(); ++i) {
-        if (platform::is_cpu_place(cpu[i].place())) {
-          cuda_pinned[i].Resize(cpu[i].dims());
-          cuda_pinned[i].set_layout(cpu[i].layout());
-          cuda_pinned_ptrs.emplace_back(
-              cuda_pinned[i].mutable_data(cuda_pinned_place, cpu[i].type()));
+        // NOTE(zjl): cudaStreamWaitEvent() must be called after all
+        // cuda[i].mutable_data() is called, since some ops release
+        // cuda memory immediately without waiting cuda kernel ends
+        platform::SetDeviceId(
+            BOOST_GET_CONST(platform::CUDAPlace, place_).device);
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            cudaEventRecord(events_[i].get(), compute_stream_));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
+
+        platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+        for (size_t i = 0; i < cpu.size(); ++i) {
+          auto cpu_place = cpu[i].place();
+          auto cpu_ptr = cpu[i].data<void>();
+          auto gpu_ptr = gpu_ptrs[i];
           auto size =
               cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
-
-          memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i],
-                       BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()),
-                       cpu[i].data<void>(), size);
-          cuda_pinned[i].set_lod(cpu[i].lod());
-        } else {
-          // we set same place flag & use cpu[i] directly
-          is_same_place_ = true;
+          if (platform::is_cuda_pinned_place(cpu_place)) {
+            memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr,
+                         BOOST_GET_CONST(platform::CUDAPinnedPlace, cpu_place),
+                         cpu_ptr, size, stream_.get());
+          } else if ((platform::is_gpu_place(cpu_place))) {
+            memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr,
+                         BOOST_GET_CONST(platform::CUDAPlace, cpu_place),
+                         cpu_ptr, size, stream_.get());
+          } else {
+            platform::CUDAPinnedPlace cuda_pinned_place;
+            framework::LoDTensor cuda_pinned_tensor;
+            cuda_pinned_tensor.Resize(cpu[i].dims());
+            auto cuda_pinned_ptr = cuda_pinned_tensor.mutable_data(
+                cuda_pinned_place, cpu[i].type());
+            memory::Copy(cuda_pinned_place, cuda_pinned_ptr,
+                         BOOST_GET_CONST(platform::CPUPlace, cpu_place),
+                         cpu_ptr, size);
+            memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr,
+                         cuda_pinned_place, cuda_pinned_ptr, size,
+                         stream_.get());
+            PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
+          }
+          cuda[i].set_lod(cpu[i].lod());
         }
+        PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
       }
     }
 #endif
@@ -141,7 +212,7 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
   }
 
   *out = std::move((platform::is_gpu_place(place_) && !is_same_place_)
-                       ? cuda_pinned_buffer_[i]
+                       ? cuda_buffer_[i]
                        : cpu_buffer_[i]);
 
   // Do not push current position into ReadAsync. Push the previous position
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index 4409aa4d399419a651e01ce7e279525916a29781..42c087b9e47a9ec7e80d05a791af3e04c483ab08 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -35,7 +35,8 @@ class BufferedReader : public framework::DecoratedReader {
 
  public:
   BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
-                 const platform::Place& place, size_t buffer_size);
+                 const platform::Place& place, size_t buffer_size,
+                 bool pin_memory = false);
 
   ~BufferedReader() override;
 
@@ -53,6 +54,7 @@ class BufferedReader : public framework::DecoratedReader {
   ThreadPool thread_pool_;
   platform::Place place_;
   const size_t buffer_size_;
+  bool pin_memory_;
 
   std::queue<std::future<size_t>> position_;
 
@@ -63,8 +65,13 @@ class BufferedReader : public framework::DecoratedReader {
   // buffers and prevent alloc every time.
   bool is_same_place_;
   std::vector<TensorVec> cpu_buffer_;
-  std::vector<TensorVec> cuda_pinned_buffer_;
+  std::vector<TensorVec> cuda_buffer_;
   size_t prev_pos_{-1UL};
+#ifdef PADDLE_WITH_CUDA
+  cudaStream_t compute_stream_;
+  std::shared_ptr<platform::CudaStreamObject> stream_;
+  std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
+#endif
 };
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..322a1637f5deec909db13f1bd0433446cd7606ae
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class LogsumexpOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "logsumexp"; }
+  virtual std::string GetOpType() const { return "Reduce logsumexp"; }
+};
+
+template <typename T>
+class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("logsumexp_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Out", this->Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetAttrMap(this->Attrs());
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker,
+                  ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
+                  ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp);
+
+REGISTER_OP_CPU_KERNEL(logsumexp,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::LogsumexpFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::LogsumexpFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                          float, ops::LogsumexpGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::LogsumexpGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c25e5d01b2758a96192d6fbf8f4e881770cbbbf0
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
@@ -0,0 +1,27 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
+
+REGISTER_OP_CUDA_KERNEL(logsumexp,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::LogsumexpFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::LogsumexpFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::LogsumexpGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::LogsumexpGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d0e00262a37ff7160abd7a865e63377f8b30461
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct LogsumexpFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    auto x_dim = x->dimensions();
+    auto t_dim = x_dim;
+    for (int i = 0; i < static_cast<int>(dim.size()); i++) {
+      t_dim[dim[i]] = 1;
+    }
+
+    auto r_dim = x_dim;
+    for (int i = 0; i < static_cast<int>(r_dim.size()); i++) {
+      r_dim[i] = 1;
+    }
+    for (int i = 0; i < static_cast<int>(dim.size()); i++) {
+      r_dim[dim[i]] = x_dim[dim[i]];
+    }
+
+    auto y_dim = y->dimensions();
+    auto x_max = x->maximum(dim);
+    y->device(place) =
+        (x_max +
+         (*x - x_max.reshape(t_dim).broadcast(r_dim)).exp().sum(dim).log())
+            .reshape(y_dim);
+  }
+};
+
+struct LogsumexpGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) * (*x - y->broadcast(dim)).exp();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index fccf6d46895ff46c40d0a5c20d4cf1b614ad8a9e..fdb2c57385b2bc1068c618f206bfeb6513d3d8c4 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -103,11 +103,7 @@ REGISTER_OP_CPU_KERNEL(reduce_mean,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          float, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         double, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int, ops::MeanFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         int64_t, ops::MeanFunctor>);
+                                         double, ops::MeanFunctor>);
 
 template <typename T>
 using CPUReduceMeanGradKernel =
@@ -115,6 +111,4 @@ using CPUReduceMeanGradKernel =
                           ops::MeanGradFunctor, true>;
 
 REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<float>,
-                       CPUReduceMeanGradKernel<double>,
-                       CPUReduceMeanGradKernel<int>,
-                       CPUReduceMeanGradKernel<int64_t>);
+                       CPUReduceMeanGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
index 4d3bce8fdd05e536baa5fecb4fc5a117e2031224..cc3653fcb43a4c000d0c61c9d854965fafd59a9c 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
@@ -66,6 +66,4 @@ class ReduceMeanKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
-                        ops::ReduceMeanKernel<double>,
-                        ops::ReduceMeanKernel<int>,
-                        ops::ReduceMeanKernel<int64_t>);
+                        ops::ReduceMeanKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
index 12eceb33ec27298d60713e72c9cc2cf91a5e7cfb..289f574719ff03b1b09f313d05bab152f5c5d651 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
@@ -21,6 +21,4 @@ using CUDAReduceMeanGradKernel =
                           ops::MeanGradFunctor, true>;
 
 REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<float>,
-                        CUDAReduceMeanGradKernel<double>,
-                        CUDAReduceMeanGradKernel<int>,
-                        CUDAReduceMeanGradKernel<int64_t>);
+                        CUDAReduceMeanGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 4673dc258d062b219fb90f644265cbaa4cfb82ef..67a19cb83c36f9cb6ef0cdd65e9fc04a7bb4d169 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -98,6 +99,18 @@ class ReduceKernel : public framework::OpKernel<T> {
     int out_dtype = context.Attr<int>("out_dtype");
     framework::proto::VarType::Type cast_out_dtype;
 
+    // The dims has full dim, set the reduce_all is True
+    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
+    std::set<int> dims_set(dims.begin(), dims.end());
+    bool full_dim = true;
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) == dims_set.end()) {
+        full_dim = false;
+        break;
+      }
+    }
+    reduce_all = (reduce_all || full_dim);
+
     if (out_dtype < 0) {
       auto* cast_input = context.Input<Tensor>("X");
       cast_out_dtype =
@@ -137,6 +150,18 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
     auto dims = context.Attr<std::vector<int>>("dim");
     bool keep_dim = context.Attr<bool>("keep_dim");
 
+    // The dims has full dim, set the reduce_all is True
+    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
+    std::set<int> dims_set(dims.begin(), dims.end());
+    bool full_dim = true;
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) == dims_set.end()) {
+        full_dim = false;
+        break;
+      }
+    }
+    reduce_all = (reduce_all || full_dim);
+
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
       auto x = EigenVector<OutT>::Flatten(*input);
@@ -183,6 +208,17 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
+    // The dims has full dim, set the reduce_all is True
+    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
+    std::set<int> dims_set(dims.begin(), dims.end());
+    bool full_dim = true;
+    for (auto i = 0; i < input_dim_size; i++) {
+      if (dims_set.find(i) == dims_set.end()) {
+        full_dim = false;
+        break;
+      }
+    }
+    reduce_all = (reduce_all || full_dim);
     // NOTE: EigenTensor::From() uses tensor->data()
     // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
     // kNoNeedBufferY should set true
@@ -200,8 +236,8 @@ class ReduceGradKernel : public framework::OpKernel<T> {
 
     if (reduce_all) {
       auto x = EigenVector<T>::Flatten(*input0);
-      auto x_reduce = EigenVector<T>::From(*input1);
-      auto x_reduce_grad = EigenVector<T>::From(*input2);
+      auto x_reduce = EigenVector<T>::Flatten(*input1);
+      auto x_reduce_grad = EigenVector<T>::Flatten(*input2);
       auto x_grad = EigenVector<T>::Flatten(*output);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
@@ -298,6 +334,12 @@ class ReduceOp : public framework::OperatorWithKernel {
                             "range [-dimension(X), dimension(X)] "
                             "which dimesion = %d. But received dim index = %d.",
                             i, x_rank, dims[i]));
+      PADDLE_ENFORCE_GE(dims[i], -x_rank,
+                        platform::errors::InvalidArgument(
+                            "The reduce dim index %d should be in the "
+                            "range [-dimension(X), dimension(X)] "
+                            "which dimesion = %d. But received dim index = %d.",
+                            i, x_rank, dims[i]));
       if (dims[i] < 0) dims[i] = x_rank + dims[i];
     }
     sort(dims.begin(), dims.end());
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index fee0f045825591d548350c289f3f290d5dd1d723..01a33a46521cd81d084f8971c47741b28a105d41 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -431,13 +431,16 @@ class Reshape2OpMaker : public ReshapeOpMaker {
               "XShape is just used to store the shape and lod of X, which will "
               "be used in FlattenGradOp.")
         .AsIntermediate();
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Used only on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
   }
 };
 
diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h
index c0fbc336e46b64fc6ee43ef1a7372e413c5c3213..1c493fc6be093a2af8f58c8e78d1be43de34306f 100644
--- a/paddle/fluid/operators/run_program_op.h
+++ b/paddle/fluid/operators/run_program_op.h
@@ -29,6 +29,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/framework/variable.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+DECLARE_bool(use_mkldnn);
 
 namespace paddle {
 namespace operators {
@@ -262,6 +267,9 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
     }
     VLOG(2) << "The number of sub scopes after forward: "
             << out_scope_vec->front()->kids().size();
+#ifdef PADDLE_WITH_MKLDNN
+    if (FLAGS_use_mkldnn) DontClearMKLDNNCache(ctx.GetPlace());
+#endif
   }
 };
 
diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h
index 5ec32c98f7f84abb255ec996d0cf6a58e6312ec3..a09220b1ccd13604b6d842237c8176578967ac64 100644
--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
@@ -21,6 +21,7 @@
 #include <sstream>
 #include <vector>
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -61,7 +62,9 @@ class SamplingIdKernel : public framework::OpKernel<T> {
 
     std::vector<int64_t> ids(batch_size);
     for (int i = 0; i < batch_size; ++i) {
-      T r = dist(engine);
+      T r = framework::Generator::GetInstance()->is_init_py
+                ? dist(framework::Generator::GetInstance()->GetCPUEngine())
+                : dist(engine);
       int idx = width - 1;
       for (int j = 0; j < width; ++j) {
         if ((r -= ins_vector[i * width + j]) < 0) {
diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc
index 7c77b2688e7b528f678418c67e77fa4abff04248..0adf61d7ce3e5b5792b9dc65d5ac8f884dc81ea5 100644
--- a/paddle/fluid/operators/selu_op.cc
+++ b/paddle/fluid/operators/selu_op.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/selu_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
 namespace paddle {
 namespace operators {
 
@@ -28,11 +31,7 @@ class SeluOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "selu");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "selu");
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
+    return UnaryOpUnchangedInferShape(ctx);
   }
 
  protected:
diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc
index 06eaca0216b36a50028fd7cfd3c0866a5b7c1de0..b45fa7c791ff22be422ce12a8348a071c60ddd0f 100644
--- a/paddle/fluid/operators/size_op.cc
+++ b/paddle/fluid/operators/size_op.cc
@@ -54,5 +54,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel<int>, ops::SizeKernel<int32_t>,
+                       ops::SizeKernel<paddle::platform::float16>,
                        ops::SizeKernel<float>, ops::SizeKernel<double>,
                        ops::SizeKernel<bool>);
diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu
index 4e5846660e62543638b669d586a92fc36b0c8e87..3ea3032693236d5618ff6f0c858cbd85e34633ab 100644
--- a/paddle/fluid/operators/size_op.cu
+++ b/paddle/fluid/operators/size_op.cu
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/size_op.h"
 
-REGISTER_OP_CUDA_KERNEL(size, paddle::operators::SizeKernel<int>,
-                        paddle::operators::SizeKernel<int32_t>,
-                        paddle::operators::SizeKernel<float>,
-                        paddle::operators::SizeKernel<bool>,
-                        paddle::operators::SizeKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    size, paddle::operators::SizeKernel<int>,
+    paddle::operators::SizeKernel<int32_t>,
+    paddle::operators::SizeKernel<paddle::platform::float16>,
+    paddle::operators::SizeKernel<float>, paddle::operators::SizeKernel<bool>,
+    paddle::operators::SizeKernel<double>);
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index 8f5df7b6d5d3cb6cee6f08edaeaa4269c70be937..d147ec3e407b0382a0ed7311cfebbe49bf14134d 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -155,6 +155,10 @@ class SliceOp : public framework::OperatorWithKernel {
           in_tensor.IsInitialized(), true,
           platform::errors::InvalidArgument(
               "The tensor Input (Input) of Slice op is not initialized."));
+      // NOTE: cuda pinned tensor need to copy its data to target place
+      if (platform::is_cuda_pinned_place(in_tensor.place())) {
+        return framework::OpKernelType(in_tensor.type(), ctx.device_context());
+      }
       return framework::OpKernelType(in_tensor.type(), in_tensor.place());
     }
     return framework::OpKernelType(
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 859776bc2a0f0056224b69f74a7e423ff2dd0a01..93d8f42ce2175bbf554eef9892f4da4b9da524ec 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -304,6 +304,7 @@ REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
 REGISTER_OP_CPU_KERNEL(
     squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -311,12 +312,14 @@ REGISTER_OP_CPU_KERNEL(
     squeeze_grad,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -324,6 +327,7 @@ REGISTER_OP_CPU_KERNEL(
     squeeze2_grad,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc
index 61a3a39de4a3f16135ea39afbba458dbfc9aa734..f469118fae7099999389076733c9143d56c0e770 100644
--- a/paddle/fluid/operators/squeeze_op.cu.cc
+++ b/paddle/fluid/operators/squeeze_op.cu.cc
@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
     squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
     ops::SqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
     ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
     ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
     ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -29,6 +30,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -36,6 +38,7 @@ REGISTER_OP_CUDA_KERNEL(
     squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
     ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
     ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
     ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
     ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
     ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -44,6 +47,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
     ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
     ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 7528422fdc09b7894898bdee94eaa11ad2cba311..f20bada8ab288fe74fd8ca82a73522a22b234191 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca8f6ce84fc571674fdfe6f29cbcd82a98fd8fcf
--- /dev/null
+++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/imperative/infer_shape_context.h"
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/operators/common_infer_shape_functions.h"
+
+USE_OP(relu);
+USE_OP(elementwise_add);
+USE_OP(softmax);
+
+namespace paddle {
+namespace operators {
+namespace details {
+
+class DygraphInferShapeTest {
+ public:
+  void AddInput(const std::string& name, const framework::DDim& dim) {
+    std::shared_ptr<imperative::VarBase> vin(
+        new imperative::VarBase(false, name));
+    vin->MutableVar()->GetMutable<framework::LoDTensor>()->Resize(dim);
+    ins_[name] = {vin};
+  }
+  void AddOutput(const std::string& name, const framework::DDim& expected_dim) {
+    std::shared_ptr<imperative::VarBase> vout(
+        new imperative::VarBase(false, name));
+    vout->MutableVar()
+        ->GetMutable<framework::LoDTensor>();  // InitializeVariable
+    outs_[name] = {vout};
+    expected_dims_[name] = expected_dim;
+  }
+  void AddAttrs(const framework::AttributeMap& attrs) { attrs_ = attrs; }
+  void SetOpType(const std::string& op_type) { op_type_ = op_type; }
+  void Run(std::function<void(framework::InferShapeContext* ctx)> infer_shape) {
+    imperative::DygraphInferShapeContext<imperative::VarBase> ctx(
+        &ins_, &outs_, &attrs_, op_type_);
+    infer_shape(&ctx);
+    for (const auto& pair : expected_dims_) {
+      auto out = outs_[pair.first][0];
+      ASSERT_EQ(pair.second,
+                out->MutableVar()->GetMutable<framework::LoDTensor>()->dims());
+    }
+  }
+
+ private:
+  imperative::NameVarBaseMap ins_;
+  imperative::NameVarBaseMap outs_;
+  framework::AttributeMap attrs_;
+  std::string op_type_;
+  std::map<std::string, framework::DDim> expected_dims_;
+};
+}  // namespace details
+
+TEST(test_UnaryOpUnchangedInferShape, test_shape) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10});
+  test.AddOutput("Out", {2, 10});
+  test.SetOpType("relu");
+  test.Run(UnaryOpUnchangedInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_same_shape) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 3, 4, 5});
+  test.AddInput("Y", {2, 3, 4, 5});
+  test.AddOutput("Out", {2, 3, 4, 5});
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_broadcast1) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 3, 4, 5});
+  test.AddInput("Y", {4, 5});
+  test.AddOutput("Out", {2, 3, 4, 5});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_broadcast2) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10, 5, 1});
+  test.AddInput("Y", {10, 1, 1});
+  test.AddOutput("Out", {2, 10, 5, 1});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_BinaryOpBroadcastInferShape, test_broadcast3) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {10, 1, 1});
+  test.AddInput("Y", {2, 10, 5, 5});
+  test.AddOutput("Out", {2, 10, 5, 5});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("elementwise_add");
+  test.Run(BinaryOpBroadcastInferShape);
+}
+
+TEST(test_UnaryOpUnchangedInferShapeCheckAxis, test_shape) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10});
+  test.AddOutput("Out", {2, 10});
+  test.AddAttrs({
+      {"axis", -1},
+  });
+  test.SetOpType("softmax");
+  test.Run(UnaryOpUnchangedInferShapeCheckAxis);
+}
+
+TEST(test_UnaryOpUnchangedInferShapeCheckAxis, test_axis_exception) {
+  details::DygraphInferShapeTest test;
+  test.AddInput("X", {2, 10});
+  test.AddOutput("Out", {2, 10});
+  test.AddAttrs({
+      {"axis", 2},
+  });
+  test.SetOpType("softmax");
+  ASSERT_ANY_THROW(test.Run(UnaryOpUnchangedInferShapeCheckAxis));
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
index f416aa6e00f5a4a82c2562c36f9d32bb1a6843aa..cc2fe4cdbdb8faa69abad28fbdd31dc4e61bdc04 100644
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
+++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
@@ -41,12 +41,12 @@ static void InitRandom(framework::Tensor *tensor,
 
 template <typename T>
 struct LeakyReluGradGradEachElementFunctor {
-  LeakyReluGradGradEachElementFunctor(const T *ddx, const T *out, T alpha,
+  LeakyReluGradGradEachElementFunctor(const T *ddx, const T *x, T alpha,
                                       T *ddout)
-      : ddx_(ddx), out_(out), alpha_(alpha), ddout_(ddout) {}
+      : ddx_(ddx), x_(x), alpha_(alpha), ddout_(ddout) {}
 
   HOSTDEVICE void operator()(int idx) {
-    if (out_[idx] > 0) {
+    if (x_[idx] >= 0) {
       ddout_[idx] = ddx_[idx];
     } else {
       ddout_[idx] = ddx_[idx] * alpha_;
@@ -54,7 +54,7 @@ struct LeakyReluGradGradEachElementFunctor {
   }
 
   const T *ddx_;
-  const T *out_;
+  const T *x_;
   T alpha_;
   T *ddout_;
 };
@@ -66,13 +66,13 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
   LeakyReluGradGradFunctor<T> functor;
   functor.alpha = alpha;
   auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place);
-  framework::Tensor *x = nullptr;
+  framework::Tensor *out = nullptr;
   framework::Tensor *dout = nullptr;
   framework::Tensor *dx = nullptr;
 
-  framework::Tensor out;
-  out.Resize(dim);
-  InitRandom<T>(&out, place);
+  framework::Tensor x;
+  x.Resize(dim);
+  InitRandom<T>(&x, place);
 
   framework::Tensor ddx;
   ddx.Resize(dim);
@@ -85,22 +85,22 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
   framework::Tensor ddout_actual;
   ddout_actual.mutable_data<T>(dim, place);
   LeakyReluGradGradEachElementFunctor<T> actual_functor(
-      ddx.data<T>(), out.data<T>(), static_cast<T>(alpha),
+      ddx.data<T>(), x.data<T>(), static_cast<T>(alpha),
       ddout_actual.data<T>());
 
-  int64_t limit = out.numel();
+  int64_t limit = x.numel();
 
 #ifdef __NVCC__
   if (platform::is_gpu_place(place)) {
     auto &cuda_dev_ctx = dynamic_cast<platform::CUDADeviceContext &>(dev_ctx);
-    functor(cuda_dev_ctx, x, &out, &ddx, &ddout, dout, dx);
+    functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx);
     platform::ForRange<platform::CUDADeviceContext> for_range(cuda_dev_ctx,
                                                               limit);
     for_range(actual_functor);
   } else {
 #endif
     auto &cpu_dev_ctx = dynamic_cast<platform::CPUDeviceContext &>(dev_ctx);
-    functor(cpu_dev_ctx, x, &out, &ddx, &ddout, dout, dx);
+    functor(cpu_dev_ctx, &x, out, &ddx, &ddout, dout, dx);
     platform::ForRange<platform::CPUDeviceContext> for_range(cpu_dev_ctx,
                                                              limit);
     for_range(actual_functor);
diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da4ca87296d92fc1052f462ae6ee8a3acb05eb49
--- /dev/null
+++ b/paddle/fluid/operators/tile_op.cc
@@ -0,0 +1,265 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/tile_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class TileOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile");
+    auto x_dims = ctx->GetInputDim("X");
+    auto repeat_times = ctx->Attrs().Get<std::vector<int>>("repeat_times");
+    if (repeat_times.size() == 0) {
+      repeat_times = std::vector<int>(x_dims.size(), -1);
+    }
+
+    PADDLE_ENFORCE_LE(
+        x_dims.size(), MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op "
+            "must not be greater than %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, x_dims.size()));
+    PADDLE_ENFORCE_LE(
+        repeat_times.size(), MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The size of the shape of input 'repeat_times' for tile op "
+            "must not be greater than %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, repeat_times.size()));
+    PADDLE_ENFORCE_GE(
+        repeat_times.size(), 1,
+        platform::errors::InvalidArgument(
+            "The size of the shape of input 'repeat_times' for tile op "
+            "must be positive integers, but the value received is %d.",
+            repeat_times.size()));
+
+    auto out_rank =
+        std::max(static_cast<size_t>(x_dims.size()), repeat_times.size());
+    std::vector<int64_t> out_shape(out_rank);
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    if (x_dim_vec.size() > repeat_times.size()) {
+      auto diff = x_dim_vec.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, -1);
+    } else {
+      auto diff = repeat_times.size() - x_dim_vec.size();
+      x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+    }
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      if (x_dim_vec[i] == -1 || repeat_times[i] == -1) {
+        out_shape[i] = -1;
+      } else {
+        PADDLE_ENFORCE_GT(
+            repeat_times[i], 0,
+            platform::errors::InvalidArgument(
+                "Every element of the input 'repeat_times' for tile op must be "
+                "greater than 0, but the value given is %d.",
+                repeat_times[i]));
+        out_shape[i] = x_dim_vec[i] * repeat_times[i];
+      }
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "repeat_times_tensor" || var_name == "RepeatTimes") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class TileOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). X is the input to be titled.");
+    AddInput(
+        "RepeatTimes",
+        "(Tensor<int>, optional). If provided, it is the number of repeat times"
+        " along specific axis. It has a higher priority than "
+        "repeat_times_tensor and the repeat_times attribute.")
+        .AsDispensable();
+    AddInput("repeat_times_tensor",
+             "(Tensor Tensor<int>), repeat times for X."
+             "It has a higher priority than repeat_times, but a lower priority "
+             "than RepeatTimes")
+        .AsDuplicable()
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "After tiling, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(repeat_times).");
+    AddAttr<std::vector<int>>("repeat_times",
+                              "The number of repeat times for each dimension.")
+        .SetDefault({});
+    AddComment(R"DOC(
+Tile operator repeats the input by given times number. You should set times
+number for each dimension by providing attribute 'repeat_times'. The rank of X
+should be in [1, 6]. Please note that size of 'repeat_times' must be the same
+with X's rank. Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(repeat_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class TileGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TileGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "TileGrad");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> repeat_times =
+        ctx->Attrs().Get<std::vector<int>>("repeat_times");
+    if (repeat_times.size() == 0) {
+      repeat_times = std::vector<int>(x_dims.size(), -1);
+    }
+
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dim_vec = framework::vectorize<int>(x_dims);
+    if (x_dim_vec.size() > repeat_times.size()) {
+      auto diff = x_dim_vec.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, -1);
+    } else {
+      auto diff = repeat_times.size() - x_dim_vec.size();
+      x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
+    }
+
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      if (repeat_times[i] == -1 || x_dim_vec[i] == -1) {
+        continue;
+      } else {
+        if (ctx->IsRuntime()) {
+          PADDLE_ENFORCE_EQ(
+              x_dim_vec[i] * repeat_times[i], out_dims[i],
+              platform::errors::InvalidArgument(
+                  "The size (%d) of the dimension %d of Input(Out@GRAD) should "
+                  "be equal to the multiplication of the crroresponding "
+                  "dimension size of Input(X) (%d) and repeat_times (%d).",
+                  out_dims[i], i, x_dim_vec[i], repeat_times[i]));
+        }
+      }
+    }
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "repeat_times_tensor" || var_name == "RepeatTimes") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template <typename T>
+class TileGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("tile_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor"));
+    op->SetInput("RepeatTimes", this->Input("RepeatTimes"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker,
+                  ops::TileGradOpMaker<paddle::framework::OpDesc>,
+                  ops::TileGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(tile_grad, ops::TileGradOp,
+                  ops::TileGradNoNeedBufVarsInferer);
+REGISTER_OP_CPU_KERNEL(
+    tile, ops::TileKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::TileKernel<paddle::platform::CPUDeviceContext, bool>);
+REGISTER_OP_CPU_KERNEL(
+    tile_grad, ops::TileGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TileGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TileGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TileGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tile_op.cu b/paddle/fluid/operators/tile_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ca82cd6a1f43551cb4d461bc47e962abd097a9a
--- /dev/null
+++ b/paddle/fluid/operators/tile_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/tile_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    tile, ops::TileKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::TileKernel<paddle::platform::CUDADeviceContext, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    tile_grad, ops::TileGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::TileGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6b0fdd720cf4be79dc403a53341b18366998a67
--- /dev/null
+++ b/paddle/fluid/operators/tile_op.h
@@ -0,0 +1,274 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define TILE_TEMPLATE(z, n, data) \
+  case n + 1: {                   \
+    Tile<n + 1>(context);         \
+    break;                        \
+  }
+#define REP_TILE_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_TEMPLATE, ~)
+#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define TILE_GRAD_CASE(n)                                        \
+  case n: {                                                      \
+    TileBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                       \
+  }
+#define TILE_GRAD_TEMPLATE(z, n, data) BOOST_PP_IF(COND(n), TILE_GRAD_CASE(n), )
+#define REP_TILE_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+inline std::vector<int> get_repeat_times(
+    const framework::ExecutionContext& ctx) {
+  if (ctx.HasInput("RepeatTimes")) {
+    auto* repeat_tensor = ctx.Input<framework::LoDTensor>("RepeatTimes");
+    auto* repeat_data = repeat_tensor->data<int>();
+    framework::Tensor cpu_repeat_tensor;
+    if (platform::is_gpu_place(repeat_tensor->place())) {
+      TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor);
+      repeat_data = cpu_repeat_tensor.data<int>();
+    }
+    auto vec_repeat_times =
+        std::vector<int>(repeat_data, repeat_data + repeat_tensor->numel());
+    return vec_repeat_times;
+  }
+
+  auto list_repeat_times_tensor =
+      ctx.MultiInput<framework::Tensor>("repeat_times_tensor");
+  if (list_repeat_times_tensor.size() > 0) {
+    // get tensor from
+    std::vector<int> vec_repeat_times;
+    for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
+      auto tensor = list_repeat_times_tensor[i];
+      if (platform::is_gpu_place(tensor->place())) {
+        framework::Tensor temp;
+        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_repeat_times.push_back(*temp.data<int32_t>());
+      } else {
+        vec_repeat_times.push_back(*tensor->data<int32_t>());
+      }
+    }
+    return vec_repeat_times;
+  } else {
+    return ctx.Attr<std::vector<int>>("repeat_times");
+  }
+}
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::To32BitIndex;
+
+template <typename DeviceContext, typename T>
+class TileKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    PADDLE_ENFORCE_GE(
+        rank, 1, platform::errors::InvalidArgument(
+                     "The rank of the input 'x' for tile op must be a positive "
+                     "integer, but the value received is %d.",
+                     rank));
+    PADDLE_ENFORCE_LE(
+        rank, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op "
+            "must be less than or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, rank));
+    auto repeat_times = get_repeat_times(context);
+    int repeat_times_size = repeat_times.size();
+    PADDLE_ENFORCE_GE(
+        repeat_times_size, 1,
+        platform::errors::InvalidArgument(
+            "The number of elements of the input 'repeat_times' for tile "
+            "op must be positive, but the value received is %d.",
+            repeat_times_size));
+    PADDLE_ENFORCE_LE(
+        repeat_times_size, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The number of elements of the input 'repeat_times' for tile op "
+            "must be less than or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, repeat_times_size));
+    rank = std::max(rank, repeat_times_size);
+    switch (rank) { REP_TILE_TEMPLATE(MAX_RANK_SUPPORTED) }
+  }
+
+ protected:
+  template <int Rank>
+  void Tile(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+
+    auto in_dims = in0->dims();
+    auto repeat_times = get_repeat_times(context);
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      PADDLE_ENFORCE_GT(
+          repeat_times[i], 0,
+          platform::errors::InvalidArgument(
+              "All elements of the input 'repeat_times' for tile op must "
+              "be positive integers, but the value received is %d.",
+              repeat_times[i]));
+    }
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    if (repeat_times.size() < vec_in_dims.size()) {
+      int diff = vec_in_dims.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, 1);
+    } else {
+      int diff = repeat_times.size() - vec_in_dims.size();
+      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    }
+    PADDLE_ENFORCE_EQ(
+        repeat_times.size(), vec_in_dims.size(),
+        platform::errors::InvalidArgument(
+            "The rank (%d) of the input 'x' and the rank (%d) of the input "
+            "'repeat_times' for tile op must match after promotion.",
+            vec_in_dims.size(), repeat_times.size()));
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      bcast_dims[i] = repeat_times[i];
+    }
+
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims(new_in_dims);
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      out_dims[i] *= repeat_times[i];
+    }
+
+    out0->Resize(out_dims);
+    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    // use 32-bit index to speed up
+    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
+    if (use_32bit_index) {
+      To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims);
+    } else {
+      y.device(place) = x.broadcast(bcast_dims);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TileGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto repeat_times = get_repeat_times(context);
+    auto x_dims = in0->dims();
+    auto vec_in_dims = framework::vectorize<int>(x_dims);
+    if (repeat_times.size() < vec_in_dims.size()) {
+      int diff = vec_in_dims.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, 1);
+    } else {
+      int diff = repeat_times.size() - vec_in_dims.size();
+      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    }
+    // 1. reshape_dims_vec is the broadcast parameter.
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(repeat_times[i]);
+      reshape_dims_vec.push_back(vec_in_dims[i]);
+    }
+
+    int dims = reduce_dims_vec.size();
+
+    bool just_copy = true;
+    for (size_t i = 0; i < repeat_times.size(); i++) {
+      if (repeat_times[i] != 1) {
+        just_copy = false;
+        break;
+      }
+    }
+    // no need reduce, just copy
+    if (just_copy) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
+                            out0);
+    } else {
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "Th rank of the input 'Out@GRAD' for tile_grad op "
+                            " must be greater than or equal to 1, but "
+                            "the value received is %d.",
+                            dims));
+      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
+                        platform::errors::InvalidArgument(
+                            "The rank of the input 'Out@GRAD' for tile_grad op "
+                            "must be less than or equal "
+                            "to %d, but the value received is %d.",
+                            MAX_RANK_SUPPORTED, dims));
+      switch (dims) { REP_TILE_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void TileBackward(const framework::ExecutionContext& context,
+                    const std::vector<int>& reshape_dims_vec,
+                    const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = reshape_dims_vec.size();
+    size_t reduce_size = reduce_dims_vec.size();
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims)
+            .sum(reduce_dims)
+            .reshape(x_grad.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..57891699fd2ad73a1cccce26438528657afdf340
--- /dev/null
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -0,0 +1,515 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdio.h>
+#include <cstdio>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
+
+// set cub base traits in order to handle float16
+namespace cub {
+template <>
+struct NumericTraits<paddle::platform::float16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
+                 paddle::platform::float16> {};
+}  // namespace cub
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+struct SegmentOffsetIter {
+  EIGEN_DEVICE_FUNC
+  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
+    return idx * num_cols_;
+  }
+
+  int num_cols_;
+};
+
+// Iter using into a column
+struct ColumnIndexIter {
+  explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
+      const Eigen::array<int, 1>& ix) const {
+    return ix[0] % num_cols_;
+  }
+
+  int num_cols_;
+};
+
+inline static int GetDesiredBlockDim(int dim) {
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+}
+
+template <typename T>
+__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
+  int col_id = threadIdx.x;
+  int row_id = blockIdx.x;
+
+  for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
+    for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
+      indices[j * num_cols + i] = i;
+    }
+  }
+}
+
+template <typename T>
+struct Pair {
+  __device__ __forceinline__ Pair() {}
+  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
+
+  __device__ __forceinline__ void set(T value, int64_t id) {
+    v = value;
+    id = id;
+  }
+
+  __device__ __forceinline__ void operator=(const Pair<T>& in) {
+    v = in.v;
+    id = in.id;
+  }
+
+  __device__ __forceinline__ bool operator<(const T value) const {
+    return (v < value);
+  }
+
+  __device__ __forceinline__ bool operator>(const T value) const {
+    return (v > value);
+  }
+  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
+    return (v < in.v) || ((v == in.v) && (id > in.id));
+  }
+
+  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
+    return (v > in.v) || ((v == in.v) && (id < in.id));
+  }
+
+  T v;
+  int64_t id;
+};
+
+template <typename T>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
+                                      int beam_size, const bool& largest) {
+  for (int k = beam_size - 2; k >= 0; k--) {
+    if (largest) {
+      if (topk[k] < p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    } else {
+      if (topk[k] > p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    }
+  }
+  topk[0] = p;
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, int beam_size,
+                                        const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, const Pair<T>& max,
+                                        int beam_size, const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp < max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp > max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
+                                              int beam_size, const T* src,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
+                                              const int tid, bool largest) {
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
+      GetTopK<T, BlockSize>(topk, src, tid, dim, length, largest);
+    } else {
+      for (int k = 0; k < MaxLength; k++) {
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
+        } else {
+          topk[k].set(-static_cast<T>(INFINITY), -1);
+        }
+      }
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
+                              length, largest);
+      }
+    }
+
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
+    *beam = 0;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
+                                            Pair<T> topk[], T** topVal,
+                                            int64_t** topIds, int* beam, int* k,
+                                            const int tid, const int warp,
+                                            const bool& largest) {
+  while (true) {
+    __syncthreads();
+    if (tid < BlockSize / 2) {
+      if (largest) {
+        if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      } else {
+        if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      }
+    }
+    __syncthreads();
+    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (largest) {
+          if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        } else {
+          if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+      **topVal = sh_topk[maxid[0]].v;
+      **topIds = sh_topk[maxid[0]].id;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
+    __syncthreads();
+
+    if (tid == maxid[0]) {
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
+      }
+    }
+    // NOTE(zcd): temporary solution
+    unsigned mask = 0u;
+    CREATE_SHFL_MASK(mask, true);
+
+    if (maxid[0] / 32 == warp) {
+      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
+          MaxLength)
+        break;
+    }
+  }
+}
+
+/**
+ * Each block compute one sample.
+ * In a block:
+ * 1. every thread get top MaxLength value;
+ * 2. merge to sh_topk, block reduce and get max value;
+ * 3. go to the second setp, until one thread's topk value is null;
+ * 4. go to the first setp, until get the topk value.
+ */
+
+template <typename T, int MaxLength, int BlockSize>
+__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
+                             const T* src, int lds, int dim, int k,
+                             int grid_dim, int num, bool largest = true) {
+  __shared__ Pair<T> sh_topk[BlockSize];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+
+  const int bid = blockIdx.x;
+  for (int i = bid; i < num; i += grid_dim) {
+    int top_num = k;
+    __shared__ int maxid[BlockSize / 2];
+    T* out = output + i * output_stride;
+    int64_t* inds = indices + i * k;
+    Pair<T> topk[MaxLength];
+    int beam = MaxLength;
+    Pair<T> max;
+    bool is_empty = false;
+    bool firststep = true;
+
+    for (int j = 0; j < MaxLength; j++) {
+      if (largest) {
+        topk[j].set(-static_cast<T>(INFINITY), -1);
+      } else {
+        topk[j].set(static_cast<T>(INFINITY), -1);
+      }
+    }
+    while (top_num) {
+      ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k, src + i * lds,
+                                             &firststep, &is_empty, &max, dim,
+                                             tid, largest);
+
+      sh_topk[tid] = topk[0];
+      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
+                                           &beam, &top_num, tid, warp, largest);
+    }
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
+                           size_t rows, size_t cols, size_t k) {
+  for (size_t i = 0; i < rows; ++i) {
+    for (size_t j = 0; j < cols; ++j) {
+      x_grad[i * cols + j] = 0;
+    }
+    for (size_t j = 0; j < k; ++j) {
+      size_t idx = indices[i * k + j];
+      x_grad[i * cols + idx] = out_grad[i * k + j];
+    }
+  }
+}
+
+// the grad assign with the axis
+template <typename T>
+__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices,
+                                   T* grad_in, int pre, int post,
+                                   int raw_height, int k) {
+  // raw_height is the length of topk axis
+  for (int i = blockIdx.x; i < pre; i += gridDim.x) {
+    const int& base_index = i * post * k;
+    const int& base_grad = i * post * raw_height;
+    for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
+      grad_in[base_grad + j] = static_cast<T>(0);
+    }
+    for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
+      const int64_t idx_ij = indices[base_index + j];
+      const int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
+      grad_in[in_ij] = grad_out[idx_ij];
+    }
+  }
+}
+// use the radix sort for the topk
+template <typename T>
+bool SortTopk(const platform::CUDADeviceContext& ctx,
+              const framework::Tensor* input_tensor, const int64_t num_cols,
+              const int64_t num_rows, const int k,
+              framework::Tensor* out_tensor, framework::Tensor* indices_tensor,
+              bool largest = true) {
+  auto cu_stream = ctx.stream();
+
+  Tensor input_indices;
+  const std::vector<int64_t> dims = {num_rows, num_cols};
+  auto dim = framework::make_ddim(dims);
+  input_indices.Resize(dim);
+  // input_indices.Resize(num_rows*num_cols);
+  input_indices.mutable_data<int64_t>(ctx.GetPlace());
+  size_t temp_storage_bytes = -1;
+
+  auto ComputeBlockSize = [](int col) {
+    if (col > 512)
+      return 1024;
+    else if (col > 256 && col <= 512)
+      return 512;
+    else if (col > 128 && col <= 256)
+      return 256;
+    else if (col > 64 && col <= 128)
+      return 128;
+    else
+      return 64;
+  };
+  int block_size = ComputeBlockSize(num_cols);
+
+  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  // actually, int num_rows < max_grid_size
+  unsigned int grid_size = num_rows < maxGridDimX
+                               ? static_cast<unsigned int>(num_rows)
+                               : maxGridDimX;
+  // Init a index array
+  InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
+      input_indices.data<int64_t>(), num_rows, num_cols);
+
+  // create iter for counting input
+  cub::CountingInputIterator<int64_t> counting_iter(0);
+  // segment_offset is used for move to next row
+  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
+                              cub::CountingInputIterator<int64_t>>
+      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
+
+  T* sorted_values_ptr;
+  int64_t* sorted_indices_ptr;
+
+  Tensor temp_values;
+  Tensor temp_indices;
+
+  const T* input = input_tensor->data<T>();
+  T* values = out_tensor->data<T>();
+  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
+
+  if (k == num_cols) {
+    // Doing a full sort.
+    sorted_values_ptr = values;
+    sorted_indices_ptr = indices;
+  } else {
+    temp_values.Resize(dim);
+    temp_indices.Resize(dim);
+    sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
+    sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
+  }
+
+  // Get temp storage buffer size, maybe can allocate a fixed buffer to save
+  // time.
+  if (largest) {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        nullptr, temp_storage_bytes, input, sorted_values_ptr,
+        input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
+        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
+        cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR)
+          << "TopKOP failed as could not launch "
+             "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
+             "temp_storage_bytes, status: "
+          << cudaGetErrorString(err);
+      return false;
+    }
+  } else {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairs(
+        nullptr, temp_storage_bytes, input, sorted_values_ptr,
+        input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
+        num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
+        cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "cub::DeviceSegmentedRadixSort::SortPairs to calculate "
+                    "temp_storage_bytes, status: "
+                 << cudaGetErrorString(err);
+      return false;
+    }
+  }
+  Tensor temp_storage;
+  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
+
+  if (largest) {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        temp_storage.data<uint8_t>(), temp_storage_bytes, input,
+        sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
+        num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
+        0, sizeof(T) * 8, cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "cub::DeviceSegmentedRadixSort::SortPairsDescending to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes
+                 << ", status: " << cudaGetErrorString(err);
+      return false;
+    }
+  } else {
+    auto err = cub::DeviceSegmentedRadixSort::SortPairs(
+        temp_storage.data<uint8_t>(), temp_storage_bytes, input,
+        sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
+        num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
+        0, sizeof(T) * 8, cu_stream);
+    if (err != cudaSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "cub::DeviceSegmentedRadixSort::SortPairs to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes
+                 << ", status: " << cudaGetErrorString(err);
+      return false;
+    }
+  }
+  auto& dev = *ctx.eigen_device();
+  if (k < num_cols) {
+    // copy sliced data to output.
+    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
+    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
+    auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
+    auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+
+    std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
+    auto dim = framework::make_ddim(odims);
+    auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
+    auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+
+    e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
+    e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
+  }
+  return true;
+}
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 82ecc2887ba240560cf15165f21bc995f4683159..d8b2e92616091a8c822c6fd0bfdfb1148c25534d 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
 #include <cstdio>
+#include <vector>
 #include "cub/cub.cuh"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
 // set cub base traits in order to handle float16
-namespace cub {
-template <>
-struct NumericTraits<paddle::platform::float16>
-    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
-                 paddle::platform::float16> {};
-}  // namespace cub
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T>
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
-
-  __device__ __forceinline__ void set(T value, int64_t id) {
-    v = value;
-    id = id;
-  }
-
-  __device__ __forceinline__ void operator=(const Pair<T>& in) {
-    v = in.v;
-    id = in.id;
-  }
-
-  __device__ __forceinline__ bool operator<(const T value) const {
-    return (v < value);
-  }
-
-  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
-    return (v < in.v) || ((v == in.v) && (id > in.id));
-  }
-
-  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
-    return (v > in.v) || ((v == in.v) && (id < in.id));
-  }
-
-  T v;
-  int64_t id;
-};
-
-template <typename T>
-__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
-                                      int beam_size) {
-  for (int k = beam_size - 2; k >= 0; k--) {
-    if (topk[k] < p) {
-      topk[k + 1] = topk[k];
-    } else {
-      topk[k + 1] = p;
-      return;
-    }
-  }
-  topk[0] = p;
-}
-
-template <typename T, int beam_size>
-__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
-  for (int k = beam_size - 2; k >= 0; k--) {
-    if (topk[k] < p) {
-      topk[k + 1] = topk[k];
-    } else {
-      topk[k + 1] = p;
-      return;
-    }
-  }
-  topk[0] = p;
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
-                                        int dim, int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < src[idx]) {
-      Pair<T> tmp(src[idx], idx);
-      AddTo<T>(topk, tmp, beam_size);
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
-                                        int dim, const Pair<T>& max,
-                                        int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < src[idx]) {
-      Pair<T> tmp(src[idx], idx);
-      if (tmp < max) {
-        AddTo<T>(topk, tmp, beam_size);
-      }
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
-                                        int idx, int dim, int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < val[idx]) {
-      Pair<T> tmp(val[idx], col[idx]);
-      AddTo<T>(topk, tmp, beam_size);
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int BlockSize>
-__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
-                                        int idx, int dim, const Pair<T>& max,
-                                        int beam_size) {
-  while (idx < dim) {
-    if (topk[beam_size - 1] < val[idx]) {
-      Pair<T> tmp(val[idx], col[idx]);
-      if (tmp < max) {
-        AddTo<T>(topk, tmp, beam_size);
-      }
-    }
-    idx += BlockSize;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
-                                              int beam_size, const T* src,
-                                              bool* firstStep, bool* is_empty,
-                                              Pair<T>* max, int dim,
-                                              const int tid) {
-  if (*beam > 0) {
-    int length = (*beam) < beam_size ? *beam : beam_size;
-    if (*firstStep) {
-      *firstStep = false;
-      GetTopK<T, BlockSize>(topk, src, tid, dim, length);
-    } else {
-      for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - (*beam)) {
-          topk[k] = topk[k + *beam];
-        } else {
-          topk[k].set(-static_cast<T>(INFINITY), -1);
-        }
-      }
-      if (!(*is_empty)) {
-        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
-                              length);
-      }
-    }
-
-    *max = topk[MaxLength - 1];
-    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
-    *beam = 0;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
-                                              int beam_size, const T* val,
-                                              int* col, bool* firstStep,
-                                              bool* is_empty, Pair<T>* max,
-                                              int dim, const int tid) {
-  if (*beam > 0) {
-    int length = (*beam) < beam_size ? *beam : beam_size;
-    if (*firstStep) {
-      *firstStep = false;
-      GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
-    } else {
-      for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - *beam) {
-          topk[k] = topk[k + *beam];
-        } else {
-          topk[k].set(-static_cast<T>(INFINITY), -1);
-        }
-      }
-      if (!(*is_empty)) {
-        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
-                              length);
-      }
-    }
-
-    *max = topk[MaxLength - 1];
-    if ((*max).v == -1) *is_empty = true;
-    *beam = 0;
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
-                                            Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int* beam, int* k,
-                                            const int tid, const int warp) {
-  while (true) {
-    __syncthreads();
-    if (tid < BlockSize / 2) {
-      if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
-        maxid[tid] = tid + BlockSize / 2;
-      } else {
-        maxid[tid] = tid;
-      }
-    }
-    __syncthreads();
-    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
-      if (tid < stride) {
-        if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
-          maxid[tid] = maxid[tid + stride];
-        }
-      }
-      __syncthreads();
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-      **topVal = sh_topk[maxid[0]].v;
-      **topIds = sh_topk[maxid[0]].id;
-      (*topVal)++;
-      (*topIds)++;
-    }
-    if (tid == maxid[0]) (*beam)++;
-    if (--(*k) == 0) break;
-    __syncthreads();
-
-    if (tid == maxid[0]) {
-      if (*beam < MaxLength) {
-        sh_topk[tid] = topk[*beam];
-      }
-    }
-    // NOTE(zcd): temporary solution
-    unsigned mask = 0u;
-    CREATE_SHFL_MASK(mask, true);
-
-    if (maxid[0] / 32 == warp) {
-      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
-          MaxLength)
-        break;
-    }
-  }
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top MaxLength value;
- * 2. merge to sh_topk, block reduce and get max value;
- * 3. go to the second setp, until one thread's topk value is null;
- * 4. go to the first setp, until get the topk value.
- */
-
-template <typename T, int MaxLength, int BlockSize>
-__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k,
-                             int grid_dim, int num) {
-  __shared__ Pair<T> sh_topk[BlockSize];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-
-  const int bid = blockIdx.x;
-  for (int i = bid; i < num; i += grid_dim) {
-    int top_num = k;
-    __shared__ int maxid[BlockSize / 2];
-    T* out = output + i * output_stride;
-    int64_t* inds = indices + i * k;
-    Pair<T> topk[MaxLength];
-    int beam = MaxLength;
-    Pair<T> max;
-    bool is_empty = false;
-    bool firststep = true;
-
-    for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-static_cast<T>(INFINITY), -1);
-    }
-    while (top_num) {
-      ThreadGetTopK<T, MaxLength, BlockSize>(
-          topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
-
-      sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
-                                           &beam, &top_num, tid, warp);
-    }
-  }
-}
-
-template <typename T, int MaxLength, int BlockSize>
-__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
-                           size_t rows, size_t cols, size_t k) {
-  for (size_t i = 0; i < rows; ++i) {
-    for (size_t j = 0; j < cols; ++j) {
-      x_grad[i * cols + j] = 0;
-    }
-    for (size_t j = 0; j < k; ++j) {
-      size_t idx = indices[i * k + j];
-      x_grad[i * cols + idx] = out_grad[i * k + j];
-    }
-  }
-}
-
-inline static int GetDesiredBlockDim(int dim) {
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
-// Iter for move to next row
-struct SegmentOffsetIter {
-  EIGEN_DEVICE_FUNC
-  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
-    return idx * num_cols_;
-  }
-
-  int num_cols_;
-};
-
-// Iter using into a column
-struct ColumnIndexIter {
-  explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
-      const Eigen::array<int, 1>& ix) const {
-    return ix[0] % num_cols_;
-  }
-
-  int num_cols_;
-};
-
-__global__ void InitIndex(int64_t* indices, int64_t num_rows,
-                          int64_t num_cols) {
-  int col_id = threadIdx.x;
-  int row_id = blockIdx.x;
-
-  for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
-    for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
-      indices[j * num_cols + i] = i;
-    }
-  }
-}
-
-template <typename T>
-bool SortTopk(const platform::CUDADeviceContext& ctx,
-              const framework::Tensor* input_tensor, const int64_t num_cols,
-              const int64_t num_rows, const int k,
-              framework::Tensor* out_tensor,
-              framework::Tensor* indices_tensor) {
-  auto cu_stream = ctx.stream();
-
-  Tensor input_indices;
-  const std::vector<int64_t> dims = {num_rows, num_cols};
-  auto dim = framework::make_ddim(dims);
-  input_indices.Resize(dim);
-  // input_indices.Resize(num_rows*num_cols);
-  input_indices.mutable_data<int64_t>(ctx.GetPlace());
-  size_t temp_storage_bytes = -1;
-
-  auto ComputeBlockSize = [](int col) {
-    if (col > 512)
-      return 1024;
-    else if (col > 256 && col <= 512)
-      return 512;
-    else if (col > 128 && col <= 256)
-      return 256;
-    else if (col > 64 && col <= 128)
-      return 128;
-    else
-      return 64;
-  };
-
-  int block_size = ComputeBlockSize(num_cols);
-
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
-  // actually, int num_rows < max_grid_size
-  unsigned int grid_size = num_rows < maxGridDimX
-                               ? static_cast<unsigned int>(num_rows)
-                               : maxGridDimX;
-  // Init a index array
-  InitIndex<<<grid_size, block_size, 0, cu_stream>>>(
-      input_indices.data<int64_t>(), num_rows, num_cols);
-
-  // create iter for counting input
-  cub::CountingInputIterator<int64_t> counting_iter(0);
-  // segment_offset is used for move to next row
-  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
-                              cub::CountingInputIterator<int64_t>>
-      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
-
-  T* sorted_values_ptr;
-  int64_t* sorted_indices_ptr;
-
-  Tensor temp_values;
-  Tensor temp_indices;
-
-  const T* input = input_tensor->data<T>();
-  T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
-
-  if (k == num_cols) {
-    // Doing a full sort.
-    sorted_values_ptr = values;
-    sorted_indices_ptr = indices;
-  } else {
-    temp_values.Resize(dim);
-    temp_indices.Resize(dim);
-    sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
-    sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
-  }
-
-  // Get temp storage buffer size, maybe can allocate a fixed buffer to save
-  // time.
-  auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      nullptr, temp_storage_bytes, input, sorted_values_ptr,
-      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
-      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-      cu_stream);
-  if (err != cudaSuccess) {
-    LOG(ERROR)
-        << "TopKOP failed as could not launch "
-           "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
-           "temp_storage_bytes, status: "
-        << cudaGetErrorString(err);
-    return false;
-  }
-  Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
-
-  err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      temp_storage.data<uint8_t>(), temp_storage_bytes, input,
-      sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
-      num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
-      0, sizeof(T) * 8, cu_stream);
-  if (err != cudaSuccess) {
-    LOG(ERROR)
-        << "TopKOP failed as could not launch "
-           "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-           "temp_storage_bytes: "
-        << temp_storage_bytes << ", status: " << cudaGetErrorString(err);
-    return false;
-  }
-  auto& dev = *ctx.eigen_device();
-  if (k < num_cols) {
-    // copy sliced data to output.
-    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
-    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
-    auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
-    auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
-
-    std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
-    auto dim = framework::make_ddim(odims);
-    auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
-    auto e_tmp_values = EigenMatrix<T>::From(temp_values);
-
-    e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
-    e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
-  }
-  return true;
-}
-
 #define FIXED_BLOCK_DIM_BASE(dim, ...) \
   case (dim): {                        \
     constexpr auto kBlockDim = (dim);  \
@@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
         framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
     const int64_t input_width = inputdims[inputdims.size() - 1];
     const auto& dev_ctx = ctx.cuda_device_context();
-
     if ((input_width <= 1024 || k >= 128 || k == input_width)) {
       if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
                       indices)) {
@@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
         framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1));
     const size_t col = xdims[xdims.size() - 1];
     const auto& dev_ctx = context.cuda_device_context();
-
     const int kMaxHeight = 2048;
     int gridx = row < kMaxHeight ? row : kMaxHeight;
     switch (GetDesiredBlockDim(col)) {
@@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
 
 }  // namespace operators
 }  // namespace paddle
-
 REGISTER_OP_CUDA_KERNEL(
     top_k,
     paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc72d83411f5a34561a75e7e75f98077ee5a4e5d
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -0,0 +1,176 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/top_k_v2_op.h"
+#include <memory>
+
+namespace paddle {
+namespace operators {
+
+class TopkV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
+                   "Output(Indices) of TopkOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("X");
+    const int& dim_size = input_dims.size();
+    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
+    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
+    PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
+                      "the axis of topk"
+                      "must be [-%d, %d), but you set axis is %d",
+                      dim_size, dim_size, axis);
+
+    if (axis < 0) axis += dim_size;
+
+    PADDLE_ENFORCE_GE(
+        k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
+    PADDLE_ENFORCE_GE(input_dims.size(), 1,
+                      "input of topk must have >= 1d shape");
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GE(
+          input_dims[axis], k,
+          "input of topk op must have >= %d columns in axis of %d", k, axis);
+    }
+
+    framework::DDim dims = input_dims;
+
+    dims[axis] = k;
+    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Indices", dims);
+    ctx->ShareLoD("X", "Out");
+    ctx->ShareLoD("X", "Indices");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(),
+        layout_, library_);
+  }
+};
+
+class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input of Topk op");
+    AddInput("K",
+             "(Tensor)  Number of top elements to look for along "
+             "the last dimension (along each row for matrices).")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
+    AddComment(R"DOC(
+Top K operator
+
+If the input is a vector (1d tensor), this operator finds the k largest 
+entries in the vector and outputs their values and indices as vectors. 
+Thus values[j] is the j-th largest entry in input, and its index is indices[j].
+
+For matrices, this operator computes the top k entries in each row. )DOC");
+    AddAttr<int>("k",
+                 "(int, default 1) Number of top elements to look for along "
+                 "the tensor).")
+        .SetDefault(1);
+    AddAttr<int>("axis",
+                 "the axis to sort and get the k indices, value."
+                 "if not set, will get k value in last axis.")
+        .SetDefault(-1);
+    AddAttr<bool>("largest",
+                  "control flag whether to return largest or smallest")
+        .SetDefault(true);
+    AddAttr<bool>("sorted",
+                  "control flag whether to return elements in sorted order")
+        .SetDefault(true);
+  }
+};
+
+class TopkV2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        platform::errors::InvalidArgument("Input(X) should be not null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("Indices"), true,
+        platform::errors::InvalidArgument("Input(Indices) should be not null"));
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
+                      platform::errors::InvalidArgument(
+                          "Grad Input(Out) should be not null"));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput(framework::GradVarName("X")), true,
+        platform::errors::InvalidArgument("Grad Output(X) should be not null"));
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+template <typename T>
+class TopkV2GradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("top_k_v2_grad");
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Indices", this->Output("Indices"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
+                  ops::TopkV2GradOpMaker<paddle::framework::OpDesc>,
+                  ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
+
+REGISTER_OP_CPU_KERNEL(top_k_v2,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, float>,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, double>,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, int32_t>,
+                       ops::TopkV2Kernel<paddle::platform::CPUPlace, int64_t>)
+
+REGISTER_OP_CPU_KERNEL(
+    top_k_v2_grad, ops::TopkV2GradKernel<paddle::platform::CPUPlace, float>,
+    ops::TopkV2GradKernel<paddle::platform::CPUPlace, double>,
+    ops::TopkV2GradKernel<paddle::platform::CPUPlace, int32_t>,
+    ops::TopkV2GradKernel<paddle::platform::CPUPlace, int64_t>)
diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5154503292014fa43efae919f20d731060b9db57
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op.cu
@@ -0,0 +1,272 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/p_norm_op.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/fluid/operators/top_k_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+
+template <typename DeviceContext, typename T>
+class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+
+    // get the attributes
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+    int axis = static_cast<int>(ctx.Attr<int>("axis"));
+    const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
+    const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
+
+    // get the input dims
+    const auto& in_dims = input->dims();
+    // calcluate the real axis
+    if (axis < 0) axis += in_dims.size();
+
+    auto* k_t = ctx.Input<Tensor>("K");
+    if (k_t) {
+      Tensor k_host;
+      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
+      k = k_host.data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[axis] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    const auto& out_dims = output->dims();
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    if (axis == in_dims.size() - 1) {
+      // if get the topK from the last axis
+      const int64_t& input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t& input_width = in_dims[in_dims.size() - 1];
+      const auto& dev_ctx = ctx.cuda_device_context();
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
+                        indices, largest)) {
+          // Successed, return.
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      // NOTE: pass lds and dim same to input width.
+      // NOTE: old matrix implementation of stride is different to eigen.
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 5,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                output_data, k, indices_data, input_data, input_width,
+                input_width, static_cast<int>(k), gridx, input_height,
+                largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape has error in the topk cuda kernel."));
+      }
+    } else {
+      // if get topK not from the last axis, will tranpose the tensor and get
+      // TopK
+
+      // first step, prepare the trans args for the tranpose
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(in_dims.size() - 1);
+      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+
+      framework::DDim trans_dims(in_dims);
+      framework::DDim trans_out_dims(output->dims());
+      for (int i = 0; i < trans.size(); i++) {
+        trans_dims[i] = in_dims[trans[i]];
+        trans_out_dims[i] = out_dims[trans[i]];
+      }
+      // second step, tranpose the input
+      Tensor trans_input;
+      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
+      int ndims = trans.size();
+      const auto& dev_ctx = ctx.cuda_device_context();
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
+                                                   &trans_input, trans);
+      // third step, calcluate the topk
+      // allocate the tmp cuda memory for the tmp result
+      Tensor trans_ind;
+      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
+      Tensor trans_out;
+      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
+
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+      const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
+                        &trans_out, &trans_ind, largest)) {
+          // last step, tranpose back the indices and output
+          TransCompute<platform::CUDADeviceContext, int64_t>(
+              ndims, dev_ctx, trans_ind, indices, trans);
+          TransCompute<platform::CUDADeviceContext, T>(
+              ndims, dev_ctx, trans_out, output, trans);
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 5,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                trans_out.data<T>(), k, trans_ind.data<int64_t>(),
+                trans_input.data<T>(), input_width, input_width,
+                static_cast<int>(k), gridx, input_height, largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape has error in the topk cuda kernel."));
+      }
+
+      // last step, tranpose back the indices and output
+      TransCompute<platform::CUDADeviceContext, int64_t>(
+          ndims, dev_ctx, trans_ind, indices, trans);
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
+                                                   output, trans);
+    }
+  }
+};
+
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
+template <typename DeviceContext, typename T>
+class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::InvalidArgument("It must use CUDAPlace."));
+    auto* x = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<Tensor>("Indices");
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int axis = context.Attr<int>("axis");
+
+    const auto& in_dims = x->dims();
+    const auto& out_dims = indices->dims();
+
+    // get the real the axis and the k
+    if (axis < 0) axis += in_dims.size();
+    const int& k = out_dims[axis];
+    const int& raw_height = in_dims[axis];
+
+    // allocate the cuda memory for the x_grad
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const int64_t* indices_data = indices->data<int64_t>();
+
+    int pre, n, post;
+    GetDims(in_dims, axis, &pre, &n, &post);
+
+    // calcluate the block and grid num
+    auto& dev_ctx = context.cuda_device_context();
+    auto ComputeBlockSize = [](int col) {
+      if (col > 512)
+        return 1024;
+      else if (col > 256 && col <= 512)
+        return 512;
+      else if (col > 128 && col <= 256)
+        return 256;
+      else if (col > 64 && col <= 128)
+        return 128;
+      else
+        return 64;
+    };
+    int block_size = ComputeBlockSize(post * k);
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+    int grid_size = std::min(max_blocks, pre);
+
+    // lanuch the cuda kernel to assign the grad
+    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+        out_grad_data, indices_data, x_grad_data, pre, post, n, k);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          float>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          double>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          paddle::platform::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
+                       paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int64_t>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a77285d123644e6ea2b9077f3338b92add42f7f0
--- /dev/null
+++ b/paddle/fluid/operators/top_k_v2_op.h
@@ -0,0 +1,321 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+  The reason why we need the topk v2 is because the compatibility. We redefine
+  the NaN is maximum value
+  in the process of comparing. If do not add the topk v2,  will affect the
+  inference result of model that traing
+  by the older version paddlepaddle.
+*/
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/operators/transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, typename Type>
+static void FullTopK(Type input_height, Type input_width, int input_dim,
+                     const framework::Tensor* input, T* t_out, Type* t_indices,
+                     const int& k, const bool& largest, const bool& sorted) {
+  // when the k is small, will the partial sort
+  bool partial_sort_flag = (k * 64) < input_width;
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  // Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    if (partial_sort_flag) {
+      std::partial_sort(
+          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
+          [&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+            if (largest) {
+              return (std::isnan(static_cast<double>(l.first)) &&
+                      !std::isnan(static_cast<double>(r.first))) ||
+                     (l.first > r.first);
+            } else {
+              return (!std::isnan(static_cast<double>(l.first)) &&
+                      std::isnan(static_cast<double>(r.first))) ||
+                     (l.first < r.first);
+            }
+          });
+    } else {
+      // use the nth-element to get the K-larger or K-small element
+      if (largest) {
+        std::nth_element(
+            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
+            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+              return (std::isnan(static_cast<double>(l.first)) &&
+                      !std::isnan(static_cast<double>(r.first))) ||
+                     (l.first > r.first);
+            });
+        // the nth-element will get the unorder elements, sort the element
+        if (sorted) {
+          std::sort(col_vec.begin(), col_vec.begin() + k - 1,
+                    [&largest](const std::pair<T, Type>& l,
+                               const std::pair<T, Type>& r) {
+                      return (std::isnan(static_cast<double>(l.first)) &&
+                              !std::isnan(static_cast<double>(r.first))) ||
+                             (l.first > r.first);
+                    });
+        }
+      } else {
+        std::nth_element(
+            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
+            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+              return (!std::isnan(static_cast<double>(l.first)) &&
+                      std::isnan(static_cast<double>(r.first))) ||
+                     (l.first < r.first);
+            });
+        // the nth-element will get the unorder elements, sort the element
+        if (sorted) {
+          std::sort(
+              col_vec.begin(), col_vec.begin() + k - 1,
+              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+                return (!std::isnan(static_cast<double>(l.first)) &&
+                        std::isnan(static_cast<double>(r.first))) ||
+                       (l.first < r.first);
+              });
+        }
+      }
+    }
+    for (Type j = 0; j < k; ++j) {
+      t_out[i * k + j] = col_vec[j].first;
+      t_indices[i * k + j] = col_vec[j].second;
+    }
+  }
+}
+
+template <typename T, typename Type>
+static void FullTopKAssign(const Type& input_height, const Type& input_width,
+                           const int& input_dim, const framework::Tensor* input,
+                           const framework::Tensor* indices, T* output_data,
+                           const int& k) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      for (Type j = 0; j < k; ++j) {
+        output_data[i * input_width + e_indices(j)] = e_input(j);
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      for (Type j = 0; j < k; ++j) {
+        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
+      }
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class TopkV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // Get the top k elements of each row of input tensor
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    auto* indices = context.Output<Tensor>("Indices");
+    const auto& in_dims = input->dims();
+    int k = static_cast<int>(context.Attr<int>("k"));
+    const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
+    const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
+
+    // axis < 0, cacluate the real axis
+    int axis = static_cast<int>(context.Attr<int>("axis"));
+    if (axis < 0) axis += in_dims.size();
+
+    // if K tensor is not null, will the use K tesnor as k
+    auto* k_t = context.Input<Tensor>("K");
+    if (k_t) {
+      k = k_t->data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      // accroding to axis to set K value in the dim
+      output_dims[axis] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
+    const auto& out_dims = output->dims();
+    if (axis + 1 == in_dims.size()) {
+      const int64_t& input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t& input_width = in_dims[in_dims.size() - 1];
+      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
+                           output_data, indices_data, k, largest, sorted);
+    } else {
+      // if the topk dims is not last dim, will tranpose and do topk
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.push_back(in_dims.size() - 1);
+      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+
+      // get the trans input_dims, out_dims
+      framework::DDim trans_dims(in_dims);
+      framework::DDim trans_out_dims(output->dims());
+      for (size_t i = 0; i < trans.size(); i++) {
+        trans_dims[i] = in_dims[trans[i]];
+      }
+      for (size_t i = 0; i < trans.size(); i++) {
+        trans_out_dims[i] = out_dims[trans[i]];
+      }
+
+      Tensor trans_inp;
+      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
+      int ndims = trans.size();
+      auto& dev_context =
+          context.template device_context<platform::CPUDeviceContext>();
+
+      // transpose the input value
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
+                                                  &trans_inp, trans);
+
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+      const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+      // Allocate the temp tensor to the save the topk indices, values
+      Tensor tmp_out;
+      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
+      Tensor tmp_indices;
+      auto* t_ind =
+          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
+
+      // get the TopK value
+      FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
+                           &trans_inp, t_out, t_ind, k, largest, sorted);
+      // transpose back
+      TransCompute<platform::CPUDeviceContext, int64_t>(
+          ndims, dev_context, tmp_indices, indices, trans);
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
+                                                  output, trans);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TopkV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<Tensor>("Indices");
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int axis = static_cast<int>(context.Attr<int>("axis"));
+
+    const auto& in_dims = x->dims();
+    const auto& out_dims = indices->dims();
+
+    // axis < 0, get the real axis
+    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+    const size_t& k = out_dims[axis];
+
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    if (axis + 1 == in_dims.size()) {
+      // allocate the memory for the input_grad
+
+      // assign the out_grad to input_grad directly
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t input_width = in_dims[in_dims.size() - 1];
+
+      // init the output grad with 0, because some input elements has no grad
+      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
+      // Assign the output_grad to input_grad
+      FullTopKAssign(input_height, input_width, in_dims.size(), out_grad,
+                     indices, x_grad_data, k);
+    } else {
+      // can not assign grad to input_grad, must do the transpose
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(out_dims.size() - 1);
+      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+      framework::DDim trans_dims(out_dims);
+      framework::DDim trans_in_dims(in_dims);
+      for (size_t i = 0; i < trans.size(); i++) {
+        trans_dims[i] = out_dims[trans[i]];
+        trans_in_dims[i] = in_dims[trans[i]];
+      }
+      // transpose the out_grad, indices
+      Tensor trans_dO;
+      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
+      Tensor trans_ind;
+      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
+      int ndims = trans.size();
+      auto& dev_context =
+          context.template device_context<platform::CPUDeviceContext>();
+
+      // Do transpose
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *out_grad,
+                                                  &trans_dO, trans);
+      TransCompute<platform::CPUDeviceContext, int64_t>(
+          ndims, dev_context, *indices, &trans_ind, trans);
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
+      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
+
+      // Assign the out_grad to tranpose input_grad
+      Tensor tmp_out;
+      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
+      memset(t_out, 0, x_grad->numel() * sizeof(T));
+
+      FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
+                                 &trans_dO, &trans_ind, t_out, k);
+
+      // Transpose back
+      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
+                                                  x_grad, trans);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 6849bd739501cc73e7142e6462a3f627f445bd22..946fa6305d737363dab3cea2e2b581f2e5659cfd 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -108,13 +108,17 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
         "Defaults to \"NHWC\". Specify the data format of the output data, "
         "the input will be transformed automatically. ")
         .SetDefault("AnyLayout");
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
         .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
     AddComment(R"DOC(
 Transpose Operator.
 
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index 9e158abba747d124c83e0366b9c0c5845c49e183..3aa9ff544af63993521d41604cecef0b283ebc1e 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <limits>
 #include <random>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -161,18 +162,27 @@ class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
     std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
                                            1.0);
     TruncatedNormal<T> truncated_normal(mean, std);
     int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = truncated_normal(dist(engine));
+
+    if (framework::Generator::GetInstance()->is_init_py) {
+      std::mt19937_64& gen_engine =
+          framework::Generator::GetInstance()->GetCPUEngine();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = truncated_normal(dist(gen_engine));
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = truncated_normal(dist(engine));
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index e0c56307639afeb70e5cc45a4022996cef52a475..a4487cde277990a725fd4c37b6d807278e314343 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/uniform_random_op.h"
 #include <string>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+
 namespace paddle {
 namespace operators {
 
@@ -55,19 +57,40 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
           "supports SelectedRows and LoDTensor");
     }
     T *data = tensor->mutable_data<T>(ctx.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
-    std::minstd_rand engine;
-    if (seed == 0) {
-      seed = std::random_device()();
-    }
-    engine.seed(seed);
+
+    int64_t size = tensor->numel();
     std::uniform_real_distribution<T> dist(
         static_cast<T>(ctx.Attr<float>("min")),
         static_cast<T>(ctx.Attr<float>("max")));
-    int64_t size = tensor->numel();
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = dist(engine);
+    auto gen_ptr = framework::Generator::GetInstance();
+    if (gen_ptr->is_init_py) {
+      std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
+      // auto gen_engine = gen_ptr_->GetCPUEngine();
+      // std::uniform_real_distribution<T> dist(
+      //    static_cast<T>(ctx.Attr<float>("min")),
+      //    static_cast<T>(ctx.Attr<float>("max")));
+
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(gen_engine);
+      }
+    } else {
+      unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+      std::minstd_rand engine;
+      if (seed == 0) {
+        seed = std::random_device()();
+      }
+      engine.seed(seed);
+      // std::uniform_real_distribution<T> dist(
+      //    static_cast<T>(ctx.Attr<float>("min")),
+      //    static_cast<T>(ctx.Attr<float>("max")));
+      // int64_t size = tensor->numel();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
     }
+    // std::mt19937_64 &engine = gen_ptr->GetCPUEngine();
+    // auto engine = gen_ptr_->GetCPUEngine();
+
     unsigned int diag_num =
         static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
     unsigned int diag_step =
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 53c79cf672e7d71ea2e7202f624a0110cc6ce41d..c024bb87b09c00c34dbaaf7b747f29743152502f 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
@@ -87,9 +88,14 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
+    if (framework::Generator::GetInstance()->is_init_py) {
+      seed = static_cast<unsigned int>(
+          framework::Generator::GetInstance()->GetCurrentSeed());
+    } else {
+      if (seed == 0) {
+        std::random_device rd;
+        seed = rd();
+      }
     }
     T min = static_cast<T>(context.Attr<float>("min"));
     T max = static_cast<T>(context.Attr<float>("max"));
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index 867b10441640c63fec9018363a59d29ac52c8743..d263dd03dd0de0d1b12925d0c3ec428b6730ef2e 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
index c141033b2b3e6b9fdeac88610dd1362ba8f98428..1aea96a15eb090ccd1a508641e4c6c0a8dcf7fb9 100644
--- a/paddle/fluid/operators/unique_op.cc
+++ b/paddle/fluid/operators/unique_op.cc
@@ -24,17 +24,63 @@ class UniqueOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique");
-    OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
-
     auto in_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(), 1,
-        platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, "
-                                          "But now the dims of Input(X) is %d.",
-                                          in_dims.size()));
+    if (!ctx->Attrs().Get<bool>("is_sorted")) {
+      OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
+      PADDLE_ENFORCE_EQ(in_dims.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "The Input(X) should be 1-D Tensor, "
+                            "But now the dims of Input(X) is %d.",
+                            in_dims.size()));
+
+      ctx->SetOutputDim("Out", {-1});
+      ctx->SetOutputDim("Index", in_dims);
+      return;
+    }
+
+    bool return_index = ctx->Attrs().Get<bool>("return_index");
+    bool return_inverse = ctx->Attrs().Get<bool>("return_inverse");
+    bool return_counts = ctx->Attrs().Get<bool>("return_counts");
+    auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis");
+
+    if (return_index) {
+      OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique");
+    }
+    if (return_inverse) {
+      OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
+    }
+    if (return_counts) {
+      OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique");
+    }
 
-    ctx->SetOutputDim("Out", {-1});
-    ctx->SetOutputDim("Index", in_dims);
+    if (axis_vec.empty()) {
+      ctx->SetOutputDim("Out", {-1});
+      if (return_inverse) {
+        ctx->SetOutputDim("Index", {framework::product(in_dims)});
+      }
+    } else {
+      int axis = axis_vec[0];
+      if (axis < 0) {
+        axis += in_dims.size();
+      }
+      PADDLE_ENFORCE_LT(
+          axis, in_dims.size(),
+          platform::errors::InvalidArgument("The axis(%d) should be less than "
+                                            "the dimension size(%d) of x.",
+                                            axis, in_dims.size()));
+      auto out_dims = in_dims;
+      out_dims[axis] = -1;
+      ctx->SetOutputDim("Out", out_dims);
+      if (return_inverse) {
+        ctx->SetOutputDim("Index", {in_dims[axis]});
+      }
+    }
+    if (return_index) {
+      ctx->SetOutputDim("Indices", {-1});
+    }
+    if (return_counts) {
+      ctx->SetOutputDim("Counts", {-1});
+    }
   }
 
  protected:
@@ -49,14 +95,47 @@ class UniqueOp : public framework::OperatorWithKernel {
 class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "Input tensor. It should be a 1-D tensor.");
+    AddInput("X",
+             "Input tensor. It should be a 1-D tensor when Attr(is_sorted)"
+             " is fasle or a N-D tensor when Attr(is_sorted) is true.");
     AddAttr<int>("dtype", "data type for output index");
     AddOutput("Out", "A unique subsequence for input tensor.");
     AddOutput("Index",
-              "An index tensor pointing to unique subsequence, which has "
-              "identical shape with input tensor and int64 dtype.");
+              "Equivalent to inverse in numpy.unique, "
+              "the indices for where elements in the original input ended up "
+              "in the returned unique tensor.");
+    AddOutput(
+        "Indices",
+        "The indices of the input tensor that result in the unique tensor.")
+        .AsDispensable();
+    AddOutput("Counts", "The counts for each unique element.").AsDispensable();
+    AddAttr<bool>("return_index",
+                  "If True, also return the indices of the input"
+                  " tensor that result in the unique Tensor.")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "return_inverse",
+        "If True, also return the indices for where elements"
+        " in the original input ended up in the returned unique tensor.")
+        .SetDefault(false);
+    AddAttr<bool>("return_counts",
+                  "If True, also return the counts for each unique element.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "axis",
+        "The axis to apply unique. If None, the input will be flattened.")
+        .SetDefault({});
+    AddAttr<bool>("is_sorted",
+                  "If True, the unique elements of X are in ascending order."
+                  "Otherwise, the unique elements are not sorted.")
+        .SetDefault(false);
     AddComment(R"DOC(
-    Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence
+    1. Return a unique subsequence for 1-D input tensor, and an index tensor
+    pointing to this unique subsequence when Attr(is_sorted) is false. This 
+    means paddle.unique is called.
+    
+    2. Returns the unique elements of X in ascending order when Attr(is_sorted)
+    is true. This means fluid.layers.unique is called.
 )DOC");
   }
 };
@@ -65,6 +144,8 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
-REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel<float>,
-                       ops::UniqueKernel<double>, ops::UniqueKernel<int32_t>,
-                       ops::UniqueKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    unique, ops::UniqueKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h
index cdfd797cbfdf87a42ce0834eea9467010a058431..dc8b2ac5555126d8cf2bb92d2f506b1bf358e680 100644
--- a/paddle/fluid/operators/unique_op.h
+++ b/paddle/fluid/operators/unique_op.h
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include <cmath>
+#include <numeric>
+#include <set>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
@@ -104,17 +109,243 @@ struct UniqueOpFunctor {
   }
 };
 
+static std::vector<framework::Tensor> Unbind(const framework::Tensor& in) {
+  int64_t size = in.dims()[0];
+  std::vector<framework::Tensor> tensors(size);
+  for (int64_t i = 0; i < size; ++i) {
+    tensors[i] = in.Slice(i, i + 1);
+  }
+  return tensors;
+}
+
+template <typename T>
+static bool Equal(const framework::Tensor& a, const framework::Tensor& b) {
+  if (a.numel() != b.numel()) {
+    return false;
+  }
+  for (int64_t i = 0; i < a.numel(); ++i) {
+    if (a.data<T>()[i] != b.data<T>()[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
 template <typename T>
+static void UniqueFlattendTensor(const framework::ExecutionContext& context,
+                                 const framework::Tensor& in,
+                                 framework::Tensor* out, bool return_index,
+                                 bool return_inverse, bool return_counts) {
+  const T* in_data = in.data<T>();
+  std::set<T> unique(in_data, in_data + in.numel());
+  out->Resize(framework::make_ddim({static_cast<int64_t>(unique.size())}));
+  auto out_data = out->mutable_data<T>(context.GetPlace());
+  std::copy(unique.begin(), unique.end(), out_data);
+
+  if (return_index) {
+    auto* indices = context.Output<framework::Tensor>("Indices");
+    indices->Resize(framework::make_ddim({out->numel()}));
+    auto indices_data = indices->mutable_data<int64_t>(context.GetPlace());
+    std::unordered_map<T, int64_t> indices_map;
+    indices_map.reserve(out->numel());
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      if (indices_map.find(in_data[i]) != indices_map.end()) continue;
+      indices_map[in_data[i]] = i;
+    }
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      indices_data[i] = indices_map[out_data[i]];
+    }
+  }
+
+  if (return_inverse) {
+    auto* inverse = context.Output<framework::Tensor>("Index");
+    inverse->Resize(framework::make_ddim({in.numel()}));
+    auto inverse_data = inverse->mutable_data<int64_t>(context.GetPlace());
+    std::unordered_map<T, int64_t> inverse_map;
+    inverse_map.reserve(out->numel());
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      inverse_map[out_data[i]] = i;
+    }
+    for (int64_t i = 0; i < in.numel(); ++i) {
+      inverse_data[i] = inverse_map[in_data[i]];
+    }
+  }
+
+  if (return_counts) {
+    auto* count = context.Output<framework::Tensor>("Counts");
+    count->Resize(framework::make_ddim({out->numel()}));
+    auto count_data = count->mutable_data<int64_t>(context.GetPlace());
+    std::unordered_map<T, int64_t> counts_map;
+    counts_map.reserve(out->numel());
+    for (int64_t i = 0; i < out->numel(); ++i) {
+      counts_map[out_data[i]] = 0;
+    }
+    for (int64_t i = 0; i < in.numel(); i++) {
+      counts_map[in_data[i]] += 1;
+    }
+    for (int64_t i = 0; i < out->numel(); i++) {
+      count_data[i] = counts_map[out_data[i]];
+    }
+  }
+}
+
+template <class ForwardIt, typename T>
+static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
+                               ForwardIt first, ForwardIt last,
+                               const std::vector<int64_t>& sorted_indices_vec,
+                               std::vector<int64_t>* inverse_vec,
+                               std::vector<int64_t>* counts_vec,
+                               std::vector<int64_t>* indices_vec) {
+  if (first == last) {
+    return last;
+  }
+
+  (*inverse_vec)[sorted_indices_vec[0]] = 0;
+  (*counts_vec)[0] = 1;
+  (*indices_vec)[0] = sorted_indices_vec[0];
+
+  ForwardIt begin = first;
+  ForwardIt result = first;
+
+  while (++first != last) {
+    int64_t idx_first = std::distance(begin, first);
+    int64_t idx_result = std::distance(begin, result);
+    if (!Equal<T>(*result, *first)) {
+      if (++result != first) {
+        *result = std::move(*first);
+      }
+      idx_result += 1;
+      (*indices_vec)[idx_result] = sorted_indices_vec[idx_first];
+    }
+    (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result;
+    (*counts_vec)[idx_result] += 1;
+  }
+  return ++result;
+}
+
+template <typename DeviceContext, typename T>
+static void UniqueDim(const framework::ExecutionContext& context,
+                      const framework::Tensor& in, framework::Tensor* out,
+                      bool return_index, bool return_inverse,
+                      bool return_counts, int axis) {
+  // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
+  std::vector<int> permute(in.dims().size());
+  std::iota(permute.begin(), permute.end(), 0);
+  permute[axis] = 0;
+  permute[0] = axis;
+  std::vector<int64_t> in_trans_dims_vec(framework::vectorize(in.dims()));
+  in_trans_dims_vec[axis] = in.dims()[0];
+  in_trans_dims_vec[0] = in.dims()[axis];
+  framework::Tensor in_trans;
+  framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec);
+  in_trans.Resize(in_trans_dims);
+  in_trans.mutable_data<T>(context.GetPlace());
+  auto& dev_ctx = context.template device_context<DeviceContext>();
+  TransCompute<DeviceContext, T>(in.dims().size(), dev_ctx, in, &in_trans,
+                                 permute);
+  // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
+  framework::DDim in_trans_flat_dims =
+      framework::flatten_to_2d(in_trans_dims, 1);
+  in_trans.Resize(in_trans_flat_dims);
+
+  // sort indices
+  std::vector<int64_t> sorted_indices_vec(in_trans.dims()[0]);
+  std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
+  int64_t col = in_trans.dims()[1];
+  const T* in_trans_data = in_trans.data<T>();
+  std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(),
+            [&](int64_t a, int64_t b) -> bool {
+              for (int64_t i = 0; i < col; ++i) {
+                T lhs = in_trans_data[i + a * col];
+                T rhs = in_trans_data[i + b * col];
+                if (lhs < rhs) {
+                  return true;
+                } else if (lhs > rhs) {
+                  return false;
+                }
+              }
+              return false;
+            });
+
+  // sort tensor according to indices
+  framework::Tensor input_sorted;
+  input_sorted.Resize(in_trans_dims);
+  input_sorted.mutable_data<T>(context.GetPlace());
+  T* input_sorted_data = input_sorted.data<T>();
+  for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
+    memcpy(input_sorted_data + i * col,
+           in_trans_data + sorted_indices_vec[i] * col, col * sizeof(T));
+  }
+
+  std::vector<framework::Tensor> input_unbind = Unbind(input_sorted);
+  std::vector<int64_t> inverse_vec(sorted_indices_vec.size(), 0);
+  std::vector<int64_t> counts_vec(sorted_indices_vec.size(), 0);
+  std::vector<int64_t> indices_vec(sorted_indices_vec.size(), 0);
+  auto last = UniqueDimImpl<std::vector<framework::Tensor>::iterator, T>(
+      context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec,
+      &inverse_vec, &counts_vec, &indices_vec);
+  input_unbind.erase(last, input_unbind.end());
+  counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
+  indices_vec.erase(indices_vec.begin() + input_unbind.size(),
+                    indices_vec.end());
+
+  math::ConcatFunctor<DeviceContext, T> concat_functor;
+  framework::Tensor out_trans;
+  std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
+  out_trans_dims_vec[0] = input_unbind.size();
+  out_trans.Resize(framework::make_ddim(out_trans_dims_vec));
+  out_trans.mutable_data<T>(context.GetPlace());
+  std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
+  out->Resize(framework::make_ddim(out_trans_dims_vec));
+  out->mutable_data<T>(context.GetPlace());
+  concat_functor(dev_ctx, input_unbind, 0, &out_trans);
+  TransCompute<DeviceContext, T>(out_trans.dims().size(), dev_ctx, out_trans,
+                                 out, permute);
+
+  if (return_inverse) {
+    auto* inverse = context.Output<framework::Tensor>("Index");
+    framework::TensorFromVector(inverse_vec, context.device_context(), inverse);
+  }
+
+  if (return_counts) {
+    auto* count = context.Output<framework::Tensor>("Counts");
+    framework::TensorFromVector(counts_vec, context.device_context(), count);
+  }
+
+  if (return_index) {
+    auto* indices = context.Output<framework::Tensor>("Indices");
+    framework::TensorFromVector(indices_vec, context.device_context(), indices);
+  }
+}
+
+template <typename DeviceContext, typename T>
 class UniqueKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto data_type = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("dtype"));
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
-    auto* index = context.Output<framework::Tensor>("Index");
+    if (!context.Attr<bool>("is_sorted")) {
+      auto data_type = static_cast<framework::proto::VarType::Type>(
+          context.Attr<int>("dtype"));
+      auto* index = context.Output<framework::Tensor>("Index");
+
+      framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
+      return;
+    }
 
-    framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
+    std::vector<int> axis_vec = context.Attr<std::vector<int>>("axis");
+    bool return_index = context.Attr<bool>("return_index");
+    bool return_inverse = context.Attr<bool>("return_inverse");
+    bool return_counts = context.Attr<bool>("return_counts");
+
+    if (axis_vec.empty()) {
+      UniqueFlattendTensor<T>(context, *x, out, return_index, return_inverse,
+                              return_counts);
+    } else {
+      int axis = axis_vec[0];
+      UniqueDim<DeviceContext, T>(context, *x, out, return_index,
+                                  return_inverse, return_counts, axis);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/xpu/mul_xpu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79aae71c3045f938f4b8f0d3e05ce7cf358c41ea
--- /dev/null
+++ b/paddle/fluid/operators/xpu/mul_xpu_op.cc
@@ -0,0 +1,183 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/operators/mul_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class MulXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    const Tensor* y = context.Input<Tensor>("Y");
+    Tensor* z = context.Output<Tensor>("Out");
+    const Tensor x_matrix =
+        x->dims().size() > 2
+            ? framework::ReshapeToMatrix(
+                  *x, context.template Attr<int>("x_num_col_dims"))
+            : *x;
+    const Tensor y_matrix =
+        y->dims().size() > 2
+            ? framework::ReshapeToMatrix(
+                  *y, context.template Attr<int>("y_num_col_dims"))
+            : *y;
+    z->mutable_data<T>(context.GetPlace());
+    auto z_dim = z->dims();
+    if (z_dim.size() != 2) {
+      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+    }
+    bool trans_a = false;
+    bool trans_b = false;
+    int m = x_matrix.dims()[0];
+    int k = x_matrix.dims()[1];
+    int k1 = y_matrix.dims()[0];
+    int n = y_matrix.dims()[1];
+    PADDLE_ENFORCE_EQ(
+        k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op"));
+    T alpha = static_cast<T>(1.0);
+    T beta = static_cast<T>(0.0);
+    const T* data_a = x_matrix.data<T>();
+    const T* data_b = y_matrix.data<T>();
+    T* data_c = z->data<T>();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k,
+                            alpha, data_a, data_b, beta, data_c);
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            ret));
+    if (z_dim.size() != 2) {
+      z->Resize(z_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MulGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
+    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto x_matrix = x->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                        : static_cast<const Tensor&>(*x);
+    auto y_matrix = y->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                        : static_cast<const Tensor&>(*y);
+    auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    Tensor dout_mat;
+    dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
+                     framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
+    auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    if (dx != nullptr) {
+      dx->set_lod(x->lod());
+    }
+    if (dy != nullptr) {
+      dy->set_lod(y->lod());
+    }
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      Tensor dx_matrix = dx->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
+                             : *dx;
+      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
+      // blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
+      bool trans_a = false;
+      bool trans_b = true;
+      int m = dout_mat.dims()[0];
+      int k = dout_mat.dims()[1];
+      int n = y_matrix.dims()[0];
+      int k1 = y_matrix.dims()[1];
+      PADDLE_ENFORCE_EQ(
+          k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op"));
+      int lda = (!trans_a) ? k : m;
+      int ldb = (!trans_b) ? n : k;
+      int ldc = n;
+      T alpha = static_cast<T>(1.0);
+      T beta = static_cast<T>(0.0);
+      const T* data_a = dout->data<T>();
+      const T* data_b = y_matrix.data<T>();
+      T* data_c = dx_matrix.data<T>();
+      int ret =
+          xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
+                          data_a, lda, data_b, ldb, beta, data_c, ldc);
+      PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                        platform::errors::External(
+                            "XPU API return wrong value[%d], please check "
+                            "where Baidu Kunlun Card is properly installed.",
+                            ret));
+    }
+
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      Tensor dy_matrix = dy->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
+                             : *dy;
+      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
+      // blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
+      bool trans_a = true;
+      bool trans_b = false;
+      int k = x_matrix.dims()[0];
+      int m = x_matrix.dims()[1];
+      int k1 = dout_mat.dims()[0];
+      int n = dout_mat.dims()[1];
+      PADDLE_ENFORCE_EQ(
+          k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op"));
+      int lda = (!trans_a) ? k : m;
+      int ldb = (!trans_b) ? n : k;
+      int ldc = n;
+      T alpha = static_cast<T>(1.0);
+      T beta = static_cast<T>(0.0);
+      const T* data_a = x_matrix.data<T>();
+      const T* data_b = dout->data<T>();
+      T* data_c = dy_matrix.data<T>();
+      int ret =
+          xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
+                          data_a, lda, data_b, ldb, beta, data_c, ldc);
+      PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                        platform::errors::External(
+                            "XPU API return wrong value[%d], please check "
+                            "where Baidu Kunlun Card is properly installed.",
+                            ret));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    mul, ops::MulXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    mul_grad, ops::MulGradXPUKernel<paddle::platform::XPUDeviceContext, float>)
+#endif
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 5a100c5746e616e860811dd47da27036ea7355d5..652b4dd47daa8aecdcae43e8c910d7dd61bbb64d 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -4,6 +4,12 @@ if(WITH_GPU)
   proto_library(cuda_error_proto SRCS cuda_error.proto)
 endif(WITH_GPU)
 
+if(WITH_XPU)
+  set(XPU_CTX_DEPS xpulib)
+ELSE()
+  set(XPU_CTX_DEPS)
+endif(WITH_XPU)
+
 if (WITH_PYTHON)
   py_proto_compile(profiler_py_proto SRCS profiler.proto)
   add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -45,11 +51,15 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
 
 cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
+if(WITH_XPU)
+cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
+endif()
+
 add_subdirectory(dynload)
 add_subdirectory(stream)
 
@@ -78,13 +88,17 @@ ELSE()
   set(STREAM_CALLBACK_DEPS)
 ENDIF()
 
+if(WITH_GLOO)
+    cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce)
+endif()
+
 cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
-    ${dgc_deps} dlpack cudnn_workspace_helper)
+    ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
 
 cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto  device_context enforce)
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 38b0894c3f71dc150a9ed737b0ac17b22baffb8a..29982c13c8ca88bc8b4a168f92e4116a283a97e8 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Place %s is not supported. Please check that your paddle compiles "
-        "with WITH_GPU option or check that your train process hold the "
+        "with WITH_GPU or WITH_XPU option or check that your train process "
+        "hold the "
         "correct gpu_id if you use Executor.",
         place));
   }
@@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool(
       PADDLE_THROW(platform::errors::Unimplemented(
           "CUDAPlace is not supported. Please re-compile with WITH_GPU "
           "option."));
+#endif
+    } else if (platform::is_xpu_place(p)) {
+#ifdef PADDLE_WITH_XPU
+      EmplaceDeviceContext<XPUDeviceContext, XPUPlace>(&device_contexts_, p);
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("XPUPlace is not supported. Please "
+                                          "re-compile with WITH_XPU option."));
 #endif
     }
   }
@@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return place_; }
 
+#ifdef PADDLE_WITH_XPU
+XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
+
+XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
+
+XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
+  int dev_id = -1;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  ret = xpu_set_device(place.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  context_ = xpu::create_context();
+  ret = xpu_set_device(dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+}
+
+void XPUDeviceContext::Wait() const {
+  int ret = xpu_set_device(place_.device);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  xpu_wait();
+}
+
+Place XPUDeviceContext::GetPlace() const { return place_; }
+
+xpu::Context* XPUDeviceContext::x_context() const { return context_; }
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
@@ -412,9 +464,21 @@ MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) {
   return cur_paddle_data_layout;
 }
 
-void MKLDNNDeviceContext::ResetBlobMap() const {
-  VLOG(3) << "Clearing DNNL cache.";
-  p_blobmap_->clear();
+void MKLDNNDeviceContext::ResetBlobMap() {
+  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
+  if (!block_next_cache_clearing_) {
+    VLOG(3) << "Clearing DNNL cache.";
+    p_blobmap_->clear();
+  } else {
+    VLOG(3) << "Prevented Clearing DNNL cache.";
+    block_next_cache_clearing_ = false;
+  }
+}
+
+void MKLDNNDeviceContext::BlockNextCacheClearing() {
+  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
+  VLOG(3) << "Next DNNL cache clearing has been blocked.";
+  block_next_cache_clearing_ = true;
 }
 
 size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 7511edb9ccf2c6ca1d5aea2964799b8be08064b6..8bfdfc8a1c6033a79c197e1cd425197f77079bda 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -43,6 +43,10 @@ limitations under the License. */
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#endif
+
 namespace paddle {
 namespace platform {
 
@@ -76,6 +80,35 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
   using TYPE = CPUDeviceContext;
 };
 
+#ifdef PADDLE_WITH_XPU
+class XPUDeviceContext : public DeviceContext {
+ public:
+  XPUDeviceContext();
+  explicit XPUDeviceContext(XPUPlace place);
+  virtual ~XPUDeviceContext();
+  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
+  Place GetPlace() const override;
+  xpu::Context* x_context() const;
+
+  /*! \brief  Wait for all operations completion in the stream. */
+  void Wait() const override;
+
+ private:
+  XPUPlace place_;
+  xpu::Context* context_;
+
+  // Need to be the same with other DeviceContext,
+  // Eventhough eigen_device_ is not used in XPU
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+  DISABLE_COPY_AND_ASSIGN(XPUDeviceContext);
+};
+
+template <>
+struct DefaultDeviceContextType<platform::XPUPlace> {
+  using TYPE = XPUDeviceContext;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice;
@@ -487,7 +520,10 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   const mkldnn::engine& GetEngine() const { return engine_; }
 
   // Remove all entries from the blob map
-  void ResetBlobMap() const;
+  void ResetBlobMap();
+
+  // Prevent next ResetBlobMap()
+  void BlockNextCacheClearing();
 
   // Get the ShapeBlob size in cur_mkldnn_session_id.
   size_t GetShapeBlobSize() const;
@@ -506,6 +542,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   mkldnn::engine engine_;
   std::shared_ptr<BlobMap> p_blobmap_;
   std::shared_ptr<std::mutex> p_mutex_;
+  bool block_next_cache_clearing_ = false;
 };
 #endif
 
diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3de2e3957a990a254ffb762f996876a122a865bc
--- /dev/null
+++ b/paddle/fluid/platform/device_context_xpu_test.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/platform/device_context.h"
+
+#include <vector>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(Device, Init) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::XPUDeviceContext;
+  using paddle::platform::XPUPlace;
+
+  int count = paddle::platform::GetXPUDeviceCount();
+  for (int i = 0; i < count; i++) {
+    XPUDeviceContext* device_context = new XPUDeviceContext(XPUPlace(i));
+    xpu::Context* ctx = device_context->x_context();
+    ASSERT_NE(nullptr, ctx);
+    delete device_context;
+  }
+}
+
+TEST(Device, DeviceContextPool) {
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::XPUDeviceContext;
+  using paddle::platform::Place;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::XPUPlace;
+
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
+
+  std::vector<Place> xpu_places;
+  int count = paddle::platform::GetXPUDeviceCount();
+  for (int i = 0; i < count; ++i) {
+    auto dev_ctx = pool.Get(XPUPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
+  }
+}
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 0eb28f0c0c3561f98891ff2a0ab5a26a20b07fb4..ebeb14e940e5fd904e506bca565c4aeae84c93cf 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -100,6 +100,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnCreateDropoutDescriptor);                  \
   __macro(cudnnDropoutGetStatesSize);                     \
   __macro(cudnnSetDropoutDescriptor);                     \
+  __macro(cudnnRestoreDropoutDescriptor);                 \
   __macro(cudnnCreateRNNDescriptor);                      \
   __macro(cudnnGetRNNParamsSize);                         \
   __macro(cudnnGetRNNWorkspaceSize);                      \
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 60e299385d6a6433d11753c7a0b96958b48a8e2a..67a79ce4bb1594afd23d960d18b75a8f0f1b2513 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -36,26 +36,29 @@ extern void* tensorrt_dso_handle;
   struct DynLoad__##__name {                                                  \
     template <typename... Args>                                               \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
-      using tensorrt_func = decltype(&::__name);                              \
       std::call_once(tensorrt_dso_flag, []() {                                \
         tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
-        PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle,                          \
-                                platform::errors::Unavailable(                \
-                                    "Load tensorrt %s failed", #__name));     \
       });                                                                     \
       static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);          \
-      PADDLE_ENFORCE_NOT_NULL(                                                \
-          p_##__name,                                                         \
-          platform::errors::Unavailable("Load tensorrt %s failed", #__name)); \
+      if (p_##__name == nullptr) {                                            \
+        return nullptr;                                                       \
+      }                                                                       \
+      using tensorrt_func = decltype(&::__name);                              \
       return reinterpret_cast<tensorrt_func>(p_##__name)(args...);            \
     }                                                                         \
   };                                                                          \
   extern DynLoad__##__name __name
 
+#if (NV_TENSORRT_MAJOR >= 6)
 #define TENSORRT_RAND_ROUTINE_EACH(__macro) \
   __macro(createInferBuilder_INTERNAL);     \
   __macro(createInferRuntime_INTERNAL);     \
   __macro(getPluginRegistry);
+#else
+#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
+  __macro(createInferBuilder_INTERNAL);     \
+  __macro(createInferRuntime_INTERNAL);
+#endif
 
 TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
 
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 9a3a639579bd9d44f257c3f0f1aa63e0ae27e8e2..ce1ec507307a2721e641ac15425c6a2321e514c7 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -33,6 +33,7 @@ limitations under the License. */
 #include <curand.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
 #include "paddle/fluid/platform/cuda_error.pb.h"
 #endif  // PADDLE_WITH_CUDA
 
@@ -69,6 +70,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+DECLARE_int32(call_stack_level);
+
 namespace paddle {
 namespace platform {
 
@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
   return str;
 }
 
-template <typename StrType>
-inline std::string GetTraceBackString(StrType&& what, const char* file,
-                                      int line) {
+inline std::string GetCurrentTraceBackString() {
   static constexpr int TRACE_STACK_LIMIT = 100;
   std::ostringstream sout;
 
@@ -256,14 +257,32 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
 #else
   sout << "Windows not support stack backtrace yet.\n";
 #endif
+  return sout.str();
+}
+
+template <typename StrType>
+inline std::string GetErrorSumaryString(StrType&& what, const char* file,
+                                        int line) {
+  std::ostringstream sout;
   sout << "\n----------------------\nError Message "
           "Summary:\n----------------------\n";
-  sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
+  sout << string::Sprintf("%s (at %s:%d)", std::forward<StrType>(what), file,
                           line)
        << std::endl;
   return sout.str();
 }
 
+template <typename StrType>
+inline std::string GetTraceBackString(StrType&& what, const char* file,
+                                      int line) {
+  if (FLAGS_call_stack_level > 1) {
+    // FLAGS_call_stack_level>1 means showing c++ call stack
+    return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line);
+  } else {
+    return GetErrorSumaryString(what, file, line);
+  }
+}
+
 inline bool is_error(bool stat) { return !stat; }
 
 inline void throw_on_error(bool stat, const std::string& msg) {
@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
  *
  * Examples:
  *    GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
-*/
+ */
 #define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE)                   \
   (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type {          \
     auto* __ptr = (__PTR);                                                  \
@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
  *
  * Examples:
  *    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
-*/
+ */
 #define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE)                   \
   do {                                                                      \
     PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound(     \
@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
  * Note: GCC 4.8 cannot select right overloaded function here, so need
  *    to define different functions and macros here, after we upgreade
  *    CI gcc version, we can only define one BOOST_GET macro.
-*/
+ */
 namespace details {
 
 #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr,      \
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 98bdf1f8c675da4e3a272945d605563e35016f8d..8667375c6f2726f1099c6e57c6e793252b01d454 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
  * Note:
  */
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+
+/**
+ * Debug related FLAG
+ * Name: FLAGS_call_stack_level
+ * Since Version: 2.0.0
+ * Value Range: int, default=2
+ * Example:
+ * Note: Used to debug. Determine the call stack to print when error or
+ * exeception happens.
+ * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
+ * If FLAGS_call_stack_level == 1, the python stack and  error message summary
+ * will be shown.
+ * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
+ * message summary will be shown.
+ */
+DEFINE_int32(
+    call_stack_level, 2,
+    "Determine the call stack to print when error or exeception happens."
+    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
+    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
+    // "shown. "
+    "If FLAGS_call_stack_level == 1, the python stack and error message "
+    "summary will be shown."
+    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
+    "error message summary will be shown.");
diff --git a/paddle/fluid/platform/gloo_context.cc b/paddle/fluid/platform/gloo_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32e7299d319c91891c7c05dd1e8cfa85e99a0422
--- /dev/null
+++ b/paddle/fluid/platform/gloo_context.cc
@@ -0,0 +1,33 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/gloo_context.h"
+
+namespace paddle {
+namespace platform {
+#if defined(PADDLE_WITH_GLOO)
+void GlooParallelContext::Init() {
+  auto gloo_ptr = paddle::framework::GlooWrapper::GetInstance();
+  gloo_ptr->SetRank(strategy_.rank);
+  gloo_ptr->SetSize(strategy_.rank_num);
+  gloo_ptr->SetPrefix(strategy_.prefix);
+  gloo_ptr->SetIface(strategy_.iface);
+  gloo_ptr->SetTimeoutSeconds(strategy_.init_seconds, strategy_.run_seconds);
+  gloo_ptr->SetHdfsStore(strategy_.path, strategy_.fs_name, strategy_.fs_ugi);
+  gloo_ptr->Init();
+}
+#endif
+
+}  //  namespace platform
+}  //  namespace paddle
diff --git a/paddle/fluid/platform/gloo_context.h b/paddle/fluid/platform/gloo_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7dcf288a22c71c29fc22ec5e131249662214b8d
--- /dev/null
+++ b/paddle/fluid/platform/gloo_context.h
@@ -0,0 +1,51 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+
+namespace paddle {
+namespace platform {
+
+#if defined(PADDLE_WITH_GLOO)
+struct GlooParallelStrategy {
+  int rank{0};
+  int rank_num{1};
+  std::string iface;
+  std::string prefix;
+  int init_seconds{9999999};
+  int run_seconds{9999999};
+  std::string path;
+  std::string fs_name;
+  std::string fs_ugi;
+};
+
+class GlooParallelContext {
+ public:
+  explicit GlooParallelContext(const GlooParallelStrategy& strategy)
+      : strategy_(strategy) {}
+
+  virtual ~GlooParallelContext() {}
+
+  virtual void Init();
+
+ protected:
+  GlooParallelStrategy strategy_;
+};
+#endif
+
+}  //  namespace platform
+}  //  namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 5f63233d8bee4beefd6e1695d8bc3d6e5e4ae7fb..ca1e5501c6a84e6136c28f564a78a7e63f0ee8d4 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
@@ -38,11 +39,11 @@ USE_GPU_MEM_STAT;
 namespace paddle {
 namespace platform {
 
-/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much
-faster way to query device properties. You can see details in
-https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
-*/
+int CudnnVersion() {
+  if (!dynload::HasCUDNN()) return -1;
 
+  return dynload::cudnnGetVersion();
+}
 static int GetCUDADeviceCountImpl() {
   int driverVersion = 0;
   cudaError_t status = cudaDriverGetVersion(&driverVersion);
@@ -73,6 +74,10 @@ int GetCUDADeviceCount() {
   return dev_cnt;
 }
 
+/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much
+faster way to query device properties. You can see details in
+https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
+*/
 int GetCUDAComputeCapability(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                     platform::errors::InvalidArgument(
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 6a9893647172e2c63f4749fdb0ae1cb0fdfaaf04..ec77447ef77dbb1cd7ee180176f95a9ab8f7c03a 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -23,6 +23,8 @@ limitations under the License. */
 
 namespace paddle {
 namespace platform {
+//! Get the version of cudnn
+int CudnnVersion();
 
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 261f6e807a22d328a20156bed8ee9974637898c3..2e708e44fd0e49e1c33e048084d15e13c6e4d57e 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -33,6 +33,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
+
 DECLARE_int32(paddle_num_threads);
 DEFINE_int32(multiple_of_cupti_buffer_size, 1,
              "Multiple of the CUPTI device buffer size. If the timestamps have "
@@ -151,6 +156,14 @@ void InitDevices(bool init_p2p) {
   } catch (const std::exception &exp) {
     LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  try {
+    // use user specified XPUs in single-node multi-process mode.
+    devices = platform::GetXPUSelectedDevices();
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
+  }
 #endif
   InitDevices(init_p2p, devices);
 }
@@ -165,7 +178,13 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
       LOG(WARNING) << "Invalid devices id.";
       continue;
     }
+
+#ifdef PADDLE_WITH_CUDA
     places.emplace_back(platform::CUDAPlace(devices[i]));
+#endif
+#ifdef PADDLE_WITH_XPU
+    places.emplace_back(platform::XPUPlace(devices[i]));
+#endif
   }
   if (init_p2p) {
     InitP2P(devices);
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
index 6392c4f4c42af9030e9dd0b3373df60938a4676f..f14fbdd74f95bfbed53ff787af861ce4656159c0 100644
--- a/paddle/fluid/platform/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -20,7 +20,7 @@ TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
   using paddle::platform::DeviceContextPool;
 
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU)
   InitDevices(true);
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U);
@@ -39,6 +39,18 @@ TEST(InitDevices, CUDA) {
 #endif
 }
 
+TEST(InitDevices, XPU) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_XPU
+  int count = paddle::platform::GetXPUDeviceCount();
+  InitDevices(true);
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
+}
+
 #ifndef _WIN32
 TEST(SignalHandle, SignalHandle) {
   std::string msg = "Signal raises";
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index c147bdccbe99e505a8fd8f1ec75c487b00c02067..3782eb684f21f8c09e9dac124082ae596fe5d1bc 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -129,6 +129,16 @@ inline void ClearMKLDNNCache(const platform::Place& place) {
   }
 }
 
+inline void DontClearMKLDNNCache(const platform::Place& place) {
+  // Clear mkl-dnn cache,
+  if (platform::is_cpu_place(place)) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::MKLDNNDeviceContext* dev_ctx =
+        (platform::MKLDNNDeviceContext*)pool.Get(place);
+    dev_ctx->BlockNextCacheClearing();
+  }
+}
+
 template <typename Type>
 mkldnn::memory::data_type MKLDNNGetDataType() {
   return mkldnn::memory::data_type::undef;
@@ -181,6 +191,8 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat(
     if (inner_nblks == 0) {
       if (strides[0] >= strides[1] && strides[1] >= strides[2]) {
         return mkldnn::memory::format_tag::ncw;
+      } else if (strides[1] >= strides[0] && strides[0] >= strides[2]) {
+        return mkldnn::memory::format_tag::ntc;
       } else {
         return mkldnn::memory::format_tag::nwc;
       }
@@ -420,5 +432,12 @@ inline std::vector<std::vector<int64_t>> ToMkldnnPadding(
   }
 }
 
+inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) {
+  return (op->GetAttrIfExists<std::string>("mkldnn_data_type") == "int8" ||
+          op->GetAttrIfExists<bool>("use_quantizer"));
+}
+
+enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP };
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 5d7143f56b3f394bb1a99c1b3802b7c20138dfb7..d1c5480c0f5438826e6eb5cc0de211ee1af74cf7 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -82,17 +82,21 @@ class MKLDNNHandlerT {
         fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
   }
 
+  template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
-    T* ptr = output->mutable_data<T>(place_, fwd_pd_->dst_desc().get_size());
+    T_out* ptr =
+        output->mutable_data<T_out>(place_, fwd_pd_->dst_desc().get_size());
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr,
                                             "@dst_mem_p");
   }
 
+  template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(
       const framework::Tensor* output) {
-    const T* output_data = output->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        bwd_pd_->dst_desc(), to_void_cast<T>(output_data), "@bwd-dst_mem_p");
+    const T_out* output_data = output->data<T_out>();
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(),
+                                            to_void_cast<T_out>(output_data),
+                                            "@bwd-dst_mem_p");
   }
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 195acc1b6d15a91369d48164179cd6e0b5cfac8d..b80d2fd1632cd82c231fae724fc4d754b8fed0fc 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -32,6 +32,7 @@ class PlacePrinter : public boost::static_visitor<> {
   void operator()(const CUDAPlace &p) {
     os_ << "CUDAPlace(" << p.device << ")";
   }
+  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
@@ -44,6 +45,10 @@ bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPlace(), p);
 }
 
+bool is_xpu_place(const Place &p) {
+  return boost::apply_visitor(IsXPUPlace(), p);
+}
+
 bool is_cpu_place(const Place &p) {
   return boost::apply_visitor(IsCPUPlace(), p);
 }
@@ -60,6 +65,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
     if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
+    } else if (is_xpu_place(p1)) {
+      return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
     } else {
       return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
     }
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index eeda10a633b655dee0da9197888738cd94b50809..f95f6954a32e771e7413a766afcfea8b85ff1f7e 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -58,31 +58,58 @@ struct CUDAPinnedPlace {
   inline bool operator<(const CUDAPinnedPlace &) const { return false; }
 };
 
+// Place for Baidu Kunlun Accelerator
+struct XPUPlace {
+  XPUPlace() : XPUPlace(0) {}
+  explicit XPUPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const XPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const XPUPlace &o) const { return !(*this == o); }
+  inline bool operator<(const XPUPlace &o) const { return device < o.device; }
+
+  int device;
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &gpu) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
 struct IsCPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
 struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
 
-class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
+struct IsXPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &xpu) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+class Place
+    : public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
  private:
-  using PlaceBase = boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace>;
+  using PlaceBase =
+      boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
 
  public:
   Place() = default;
   Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}     // NOLINT
+  Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}     // NOLINT
   Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
   Place(const CUDAPinnedPlace &cuda_pinned_place)                // NOLINT
       : PlaceBase(cuda_pinned_place) {}
@@ -98,6 +125,7 @@ class Place : public boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> {
 using PlaceList = std::vector<Place>;
 
 bool is_gpu_place(const Place &);
+bool is_xpu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
@@ -115,6 +143,16 @@ struct PlaceVisitorWrapper
     return visitor_(cpu);
   }
 
+  typename Visitor::result_type operator()(const XPUPlace &xpu) const {
+#ifdef PADDLE_WITH_XPU
+    return visitor_(xpu);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Paddle is not compiled with XPU. Cannot visit xpu device"));
+    return typename Visitor::result_type();
+#endif
+  }
+
   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
 #ifdef PADDLE_WITH_CUDA
     return visitor_(cuda);
diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc
index e4c1d3def90f191194b46bb9ea27dd27d69dcb8b..13f28c73f4504aea85d6155a3daa8f8f01b26385 100644
--- a/paddle/fluid/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
@@ -18,19 +18,32 @@
 TEST(Place, Equality) {
   paddle::platform::CPUPlace cpu;
   paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
+  paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
 
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
   EXPECT_EQ(g1, g1);
   EXPECT_EQ(g0, gg0);
+  EXPECT_EQ(x0, x0);
+  EXPECT_EQ(x1, x1);
+  EXPECT_EQ(x0, xx0);
 
   EXPECT_NE(g0, g1);
+  EXPECT_NE(x0, x1);
 
   EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
+  EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
   EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(x0, cpu));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, x0));
 }
 
 TEST(Place, Print) {
+  {
+    std::stringstream ss;
+    ss << paddle::platform::XPUPlace(1);
+    EXPECT_EQ("XPUPlace(1)", ss.str());
+  }
   {
     std::stringstream ss;
     ss << paddle::platform::CUDAPlace(1);
diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8c5f85f9cfe4b9d6ac07069fff89d37c695af5b
--- /dev/null
+++ b/paddle/fluid/platform/xpu_header.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include "xpu/api.h"
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+
+namespace xpu = baidu::xpu::api;
+#endif
diff --git a/paddle/fluid/platform/xpu_info.cc b/paddle/fluid/platform/xpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f88248fda7e65e1b96448c0576880a18a9d8a4a9
--- /dev/null
+++ b/paddle/fluid/platform/xpu_info.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/platform/xpu_info.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+#include "gflags/gflags.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/string/split.h"
+
+DEFINE_string(selected_xpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (XPU). If you want to use "
+              "all visible devices, set this to empty string. NOTE: the "
+              "reason of doing this is that we want to use P2P communication"
+              "between XPU devices, use XPU_VISIBLE_DEVICES can only use"
+              "share-memory only.");
+
+namespace paddle {
+namespace platform {
+
+static int GetXPUDeviceCountImpl() {
+  const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
+  if (xpu_visible_devices != nullptr) {
+    std::string xpu_visible_devices_str(xpu_visible_devices);
+    if (std::all_of(xpu_visible_devices_str.begin(),
+                    xpu_visible_devices_str.end(),
+                    [](char ch) { return ch == ' '; })) {
+      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
+      return 0;
+    }
+  }
+
+  int count = 0;
+  int ret = xpu_device_count(&count);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+  return count;
+}
+
+int GetXPUDeviceCount() {
+  static auto dev_cnt = GetXPUDeviceCountImpl();
+  return dev_cnt;
+}
+
+int GetXPUCurrentDeviceId() {
+  int dev_id;
+  int ret = xpu_current_device(&dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
+    dev_id -= 64;
+  }
+  return dev_id;
+}
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices() {
+  // use user specified XPUs in single-node multi-process mode.
+  std::vector<int> devices;
+  if (!FLAGS_selected_xpus.empty()) {
+    auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ',');
+    for (auto id : devices_str) {
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    int count = GetXPUDeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
+void SetXPUDeviceId(int id) {
+  PADDLE_ENFORCE_LT(
+      id, GetXPUDeviceCount(),
+      platform::errors::InvalidArgument("id must less than XPU count"));
+  int ret = xpu_set_device(id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        ret));
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/xpu_info.h b/paddle/fluid/platform/xpu_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..efaba13453e7472ed09ff66c70bdaf19eb89549d
--- /dev/null
+++ b/paddle/fluid/platform/xpu_info.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <vector>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total number of XPU devices in system.
+int GetXPUDeviceCount();
+
+//! Get the current XPU device id in system.
+int GetXPUCurrentDeviceId();
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices();
+
+//! Set the XPU device id for next execution.
+void SetXPUDeviceId(int device_id);
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index dbc9eb065c4240a7d2dc135965f23ddc153bfd16..d733cf26ed209bcb86eaf2d366e45cfa0e7f9a90 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
-  gloo_wrapper infer_io_utils)
+  gloo_wrapper infer_io_utils heter_wrapper generator)
 
 if (WITH_NCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
@@ -31,12 +31,19 @@ set(PYBIND_SRCS
   global_value_getter_setter.cc
   reader_py.cc
   fleet_wrapper_py.cc
+  heter_wrapper_py.cc
   gloo_wrapper_py.cc
   box_helper_py.cc
   data_set_py.cc
   imperative.cc
   ir.cc
-  inference_api.cc)
+  inference_api.cc
+  generator_py.cc)
+
+if(WITH_GLOO)
+  set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context)
+  set(PYBIND_SRCS ${PYBIND_SRCS} gloo_context_py.cc)
+endif(WITH_GLOO)
 
 if (WITH_CRYPTO)
   set(PYBIND_DEPS ${PYBIND_DEPS} paddle_crypto)
@@ -70,13 +77,23 @@ if(WITH_PYTHON)
   set(tmp_impl_file ${impl_file}.tmp)
 
   if(WIN32)
-    add_custom_command(TARGET op_function_generator
-          POST_BUILD
-          COMMAND "${CMAKE_BINARY_DIR}/paddle/fluid/pybind/${CMAKE_BUILD_TYPE}/op_function_generator"
-              "${tmp_impl_file}"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
-          COMMENT "copy_if_different ${impl_file}"
-          VERBATIM
+    file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat ""
+    "set build_times=1\n"
+    ":retry\n"
+    "ECHO op_function_generator run %build_times% time\n"
+    "${CMAKE_BINARY_DIR}/paddle/fluid/pybind/${CMAKE_BUILD_TYPE}/op_function_generator ${impl_file}\n"
+    "if %ERRORLEVEL% NEQ 0 (\n"
+    "    set /a build_times=%build_times%+1\n"
+    "    if %build_times% GTR 100 (\n"
+    "        exit /b 1\n"
+    "    ) else (\n"
+    "        goto :retry\n"
+    "    )\n"
+    ")\n"
+    "exit /b 0")
+
+    add_custom_command(TARGET op_function_generator POST_BUILD
+          COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
     )
 
     if(${CBLAS_PROVIDER} STREQUAL MKLML)
diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3bccd5fb2dd92298323381c09467937abd87a53c
--- /dev/null
+++ b/paddle/fluid/pybind/generator_py.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/pybind/generator_py.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+void BindGenerator(py::module* m) {
+  py::class_<framework::GeneratorState>(*m, "GeneratorState", "");
+  py::class_<std::mt19937_64>(*m, "mt19937_64", "");
+  py::class_<framework::Generator, std::shared_ptr<framework::Generator>>(
+      *m, "Generator")
+      .def(py::init([]() { return framework::Generator::GetInstanceX(); }),
+           py::return_value_policy::reference)
+      .def("get_state", &framework::Generator::GetState,
+           py::return_value_policy::move)
+      .def("set_state", &framework::Generator::SetState)
+      .def("manual_seed", &framework::Generator::SetCurrentSeed)
+      .def("seed", &framework::Generator::Seed)
+      .def("initial_seed", &framework::Generator::GetCurrentSeed)
+      .def("random", &framework::Generator::Random64)
+      .def("get_cpu_engine", &framework::Generator::GetCPUEngine,
+           py::return_value_policy::move)
+      .def("set_cpu_engine", &framework::Generator::SetCPUEngine);
+}  // end Generator
+}  // end namespace pybind
+}  // end namespace paddle
diff --git a/paddle/fluid/pybind/generator_py.h b/paddle/fluid/pybind/generator_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..d37654c1ba24e296fb325d1507187c5a954754bd
--- /dev/null
+++ b/paddle/fluid/pybind/generator_py.h
@@ -0,0 +1,28 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+void BindGenerator(py::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 5178b5f89adf3b8a39b303228d1e674b22e7dc2d..deca9625e63d05625c407a1282b396398bb78ccc 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/global_value_getter_setter.h"
+
 #include <cctype>
 #include <functional>
 #include <string>
@@ -20,6 +21,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/python_headers.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic);
 DECLARE_bool(enable_rpc_profiler);
 DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
+DECLARE_int32(call_stack_level);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() {
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
       FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler,
-      FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode,
-      FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir,
-      FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size,
-      FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem,
-      FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion,
-      FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism,
-      FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads);
+      FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+      FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
+      FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
+      FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
+      FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
+      FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
+      FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
+      FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
+      FLAGS_paddle_num_threads);
 
 #ifdef PADDLE_WITH_CUDA
   REGISTER_PUBLIC_GLOBAL_VAR(
diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a9c77b0c3a06ca9a17f33643d88ddf932c32544
--- /dev/null
+++ b/paddle/fluid/pybind/gloo_context_py.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pybind/gloo_context_py.h"
+
+#include <Python.h>
+#include <pybind11/chrono.h>
+#include <pybind11/complex.h>
+#include <pybind11/functional.h>
+#include <pybind11/stl.h>
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/memory/allocation/mmap_allocator.h"
+#include "paddle/fluid/platform/gloo_context.h"
+
+namespace paddle {
+namespace pybind {
+
+namespace py = ::pybind11;
+
+// Bind Methods
+void BindGlooContext(py::module *m) {
+// define parallel context for gloo
+#if defined(PADDLE_WITH_GLOO)
+  py::class_<platform::GlooParallelStrategy> gloo_parallel_strategy(
+      *m, "GlooParallelStrategy", "");
+  gloo_parallel_strategy.def(py::init())
+      .def_property("rank_num",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.rank_num;
+                    },
+                    [](platform::GlooParallelStrategy &self, int nranks) {
+                      self.rank_num = nranks;
+                    })
+      .def_property(
+          "rank",
+          [](const platform::GlooParallelStrategy &self) { return self.rank; },
+          [](platform::GlooParallelStrategy &self, int rank) {
+            self.rank = rank;
+          })
+      .def_property(
+          "iface",
+          [](const platform::GlooParallelStrategy &self) { return self.iface; },
+          [](platform::GlooParallelStrategy &self, const std::string &iface) {
+            self.iface = iface;
+          })
+      .def_property("prefix",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.prefix;
+                    },
+                    [](platform::GlooParallelStrategy &self,
+                       const std::string &prefix) { self.prefix = prefix; })
+      .def_property("init_seconds",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.init_seconds;
+                    },
+                    [](platform::GlooParallelStrategy &self, int init_seconds) {
+                      self.init_seconds = init_seconds;
+                    })
+      .def_property("run_seconds",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.run_seconds;
+                    },
+                    [](platform::GlooParallelStrategy &self, int run_seconds) {
+                      self.run_seconds = run_seconds;
+                    })
+      .def_property(
+          "path",
+          [](const platform::GlooParallelStrategy &self) { return self.path; },
+          [](platform::GlooParallelStrategy &self, const std::string &path) {
+            self.path = path;
+          })
+      .def_property("fs_name",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.fs_name;
+                    },
+                    [](platform::GlooParallelStrategy &self,
+                       const std::string &fs_name) { self.fs_name = fs_name; })
+      .def_property("fs_ugi",
+                    [](const platform::GlooParallelStrategy &self) {
+                      return self.fs_ugi;
+                    },
+                    [](platform::GlooParallelStrategy &self,
+                       const std::string &fs_ugi) { self.fs_ugi = fs_ugi; });
+
+  py::class_<platform::GlooParallelContext> gloo_ctx(*m, "GlooParallelContext");
+  gloo_ctx.def(py::init<const platform::GlooParallelStrategy &>())
+      .def("init", [](platform::GlooParallelContext &self) { self.Init(); });
+#endif
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/gloo_context_py.h b/paddle/fluid/pybind/gloo_context_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..89bd183097b7541c33a797f27178bafb934bcd52
--- /dev/null
+++ b/paddle/fluid/pybind/gloo_context_py.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <Python.h>
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindGlooContext(pybind11::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/heter_wrapper_py.cc b/paddle/fluid/pybind/heter_wrapper_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0603cd3faae1d7007ec31813373019172baf6eb7
--- /dev/null
+++ b/paddle/fluid/pybind/heter_wrapper_py.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+
+#include <string>
+#include <vector>
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "paddle/fluid/framework/fleet/heter_wrapper.h"
+#include "paddle/fluid/pybind/heter_wrapper_py.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+#ifdef PADDLE_WITH_PSLIB
+void BindHeterWrapper(py::module* m) {
+  py::class_<framework::HeterWrapper, std::shared_ptr<framework::HeterWrapper>>(
+      *m, "Heter")
+      .def(py::init([]() { return framework::HeterWrapper::GetInstance(); }))
+      .def("create_client2xpu_connection",
+           &framework::HeterWrapper::CreateClient2XpuConnection)
+      .def("set_xpu_list", &framework::HeterWrapper::SetXpuList)
+      .def("start_xpu_service", &framework::HeterWrapper::StartXpuService)
+      .def("end_pass", &framework::HeterWrapper::EndPass)
+      .def("stop_xpu_service", &framework::HeterWrapper::StopXpuService);
+}  // end HeterWrapper
+#endif
+}  // end namespace pybind
+}  // end namespace paddle
diff --git a/paddle/fluid/pybind/heter_wrapper_py.h b/paddle/fluid/pybind/heter_wrapper_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..047c1f359ea8397998b967afbe30f5c0ff8d6170
--- /dev/null
+++ b/paddle/fluid/pybind/heter_wrapper_py.h
@@ -0,0 +1,29 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+#ifdef PADDLE_WITH_PSLIB
+void BindHeterWrapper(py::module* m);
+#endif
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 82941c58280560b1c09b149da01ef3d6e8a3f8e0..021d10ca7facb0bac11cd5d08eddea7e01b9b566 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -19,13 +19,17 @@ limitations under the License. */
 #include <pybind11/complex.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
+
 #include <memory>
 #include <set>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/imperative/all_reduce.h"
+#include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/basic_engine.h"
 #include "paddle/fluid/imperative/data_loader.h"
@@ -62,11 +66,13 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
     return place_obj.cast<platform::CPUPlace>();
   } else if (py::isinstance<platform::CUDAPlace>(place_obj)) {
     return place_obj.cast<platform::CUDAPlace>();
+  } else if (py::isinstance<platform::XPUPlace>(place_obj)) {
+    return place_obj.cast<platform::XPUPlace>();
   } else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
     return place_obj.cast<platform::CUDAPinnedPlace>();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
   }
 }
 
@@ -74,16 +80,23 @@ static void InitTensorForVarBase(imperative::VarBase *self,
                                  const py::array &array,
                                  const platform::Place place,
                                  bool persistable = false,
-                                 bool zero_copy = false,
-                                 std::string name = "") {
+                                 bool zero_copy = false, std::string name = "",
+                                 int stop_gradient = -1) {
   if (name == "") {
-    name = imperative::GetCurrentTracer()->GenerateUniqueName("generated_var");
+    name =
+        imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor");
   }
+  VLOG(5) << "Init Tensor as: / name: " << name
+          << " / persistable: " << persistable << " / zero_copy: " << zero_copy
+          << " / stop_gradient: " << stop_gradient;
   new (self) imperative::VarBase(name);
   auto *tensor = self->MutableVar()->GetMutable<framework::LoDTensor>();
   if (platform::is_cpu_place(place)) {
     SetTensorFromPyArray<platform::CPUPlace>(
         tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy);
+  } else if (platform::is_xpu_place(place)) {
+    SetTensorFromPyArray<platform::XPUPlace>(
+        tensor, array, BOOST_GET_CONST(platform::XPUPlace, place), zero_copy);
   } else if (platform::is_gpu_place(place)) {
     SetTensorFromPyArray<platform::CUDAPlace>(
         tensor, array, BOOST_GET_CONST(platform::CUDAPlace, place), zero_copy);
@@ -93,7 +106,10 @@ static void InitTensorForVarBase(imperative::VarBase *self,
         zero_copy);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place should be one of CPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
+  }
+  if (stop_gradient != -1) {
+    self->SetOverridedStopGradient(stop_gradient);
   }
   self->SetPersistable(persistable);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -102,12 +118,11 @@ static void InitTensorForVarBase(imperative::VarBase *self,
 
 static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self,
                                            const py::kwargs &kwargs) {
-  VLOG(4) << "Init VarBase";
+  VLOG(4) << "Init VarBase from kwargs: ";
   PADDLE_ENFORCE_EQ(
       kwargs.contains("value"), true,
       platform::errors::NotFound(
           "The kwargs used to create Varbase misses argument: value"));
-
   auto persistable = kwargs.contains("persistable")
                          ? kwargs["persistable"].cast<bool>()
                          : false;
@@ -116,10 +131,14 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self,
   auto zero_copy =
       kwargs.contains("zero_copy") ? kwargs["zero_copy"].cast<bool>() : false;
   auto name = kwargs.contains("name") ? kwargs["name"].cast<std::string>() : "";
+  auto stop_gradient = kwargs.contains("stop_gradient")
+                           ? kwargs["stop_gradient"].cast<int>()
+                           : -1;
   auto default_place = imperative::GetCurrentTracer()->ExpectedPlace();
   auto place = kwargs.contains("place") ? PyObjectToPlace(kwargs["place"])
                                         : default_place;
-  InitTensorForVarBase(self, array, place, persistable, zero_copy, name);
+  InitTensorForVarBase(self, array, place, persistable, zero_copy, name,
+                       stop_gradient);
 }
 
 template <typename P>
@@ -127,15 +146,24 @@ static void InitVarBaseFromNumpyWithArg(imperative::VarBase *self,
                                         const py::array &array, const P &place,
                                         bool persistable = false,
                                         bool zero_copy = false,
-                                        std::string name = "") {
-  VLOG(4) << "Init VarBase";
-  // 0: self, 1: value, 2: place, 3: persistable, 4: zero_copy, 5: name
+                                        std::string name = "",
+                                        int stop_gradient = -1) {
+  VLOG(4) << "Init VarBase from Arg: ";
+  // 0: self, 1: value, 2: place, 3: persistable, 4: zero_copy, 5: name , 6:
+  // stop_gradient
   if (name == "") {
-    name = imperative::GetCurrentTracer()->GenerateUniqueName("generated_var");
+    name =
+        imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor");
   }
+  VLOG(5) << "Init Tensor as: / name: " << name
+          << " / persistable: " << persistable << " / zero_copy: " << zero_copy
+          << " / stop_gradient: " << stop_gradient;
   new (self) imperative::VarBase(name);
   self->SetPersistable(persistable);
   auto *tensor = self->MutableVar()->GetMutable<framework::LoDTensor>();
+  if (stop_gradient != -1) {
+    self->SetOverridedStopGradient(stop_gradient);
+  }
   SetTensorFromPyArray<P>(tensor, array, place, zero_copy);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
   self->SetDataType(tensor->type());
@@ -143,7 +171,7 @@ static void InitVarBaseFromNumpyWithArg(imperative::VarBase *self,
 
 static void InitVarBaseFromNumpyWithArgDefault(imperative::VarBase *self,
                                                const py::array &array) {
-  VLOG(4) << "Init VarBase";
+  VLOG(4) << "Init VarBase from numpy: ";
   auto place = imperative::GetCurrentTracer()->ExpectedPlace();
   InitTensorForVarBase(self, array, place);
 }
@@ -153,7 +181,7 @@ static void InitVarBaseFromTensorWithArgDefault(
   VLOG(4) << "Init VarBase";
   auto place = imperative::GetCurrentTracer()->ExpectedPlace();
   new (self) imperative::VarBase(
-      imperative::GetCurrentTracer()->GenerateUniqueName("generated_var"));
+      imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor"));
   self->SetPersistable(false);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
   self->SetDataType(tensor.type());
@@ -537,8 +565,7 @@ void BindImperative(py::module *m_ptr) {
         });
 
   py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
-      m, "VarBase",
-      R"DOC()DOC")
+      m, "VarBase", R"DOC()DOC")
       .def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
       .def("__init__",
            [](imperative::VarBase &self, framework::proto::VarType::Type dtype,
@@ -548,7 +575,7 @@ void BindImperative(py::module *m_ptr) {
              std::string act_name = "";
              if (!name.ptr() || name.ptr() == Py_None) {
                act_name = imperative::GetCurrentTracer()->GenerateUniqueName(
-                   "generated_var");
+                   "generated_tensor");
              } else {
                act_name = name.cast<std::string>();
              }
@@ -564,13 +591,20 @@ void BindImperative(py::module *m_ptr) {
            })
       .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CPUPlace>,
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
-           py::arg("zero_copy") = false, py::arg("name") = "")
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
+      .def("__init__", &InitVarBaseFromNumpyWithArg<platform::XPUPlace>,
+           py::arg("value"), py::arg("place"), py::arg("persistable") = false,
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CUDAPlace>,
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
-           py::arg("zero_copy") = false, py::arg("name") = "")
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArg<platform::CUDAPinnedPlace>,
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
-           py::arg("zero_copy") = false, py::arg("name") = "")
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
       .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"))
       .def("__init__", &InitVarBaseFromNumpyWithKwargs)
@@ -793,6 +827,15 @@ void BindImperative(py::module *m_ptr) {
            [](const imperative::VarBase &self, const platform::CPUPlace &place,
               bool blocking) { return self.NewVarBase(place, blocking); },
            py::return_value_policy::copy)
+      .def("_copy_to",
+           [](const imperative::VarBase &self,
+              const platform::CUDAPinnedPlace &place,
+              bool blocking) { return self.NewVarBase(place, blocking); },
+           py::return_value_policy::copy)
+      .def("_copy_to",
+           [](const imperative::VarBase &self, const platform::XPUPlace &place,
+              bool blocking) { return self.NewVarBase(place, blocking); },
+           py::return_value_policy::copy)
       .def("_copy_to",
            [](const imperative::VarBase &self, const platform::CUDAPlace &place,
               bool blocking) { return self.NewVarBase(place, blocking); },
@@ -821,6 +864,9 @@ void BindImperative(py::module *m_ptr) {
               return std::vector<int>();
             }
           })
+      .def_property_readonly(
+          "place", [](imperative::VarBase &self) { return self.Place(); },
+          py::return_value_policy::copy)
       .def_property_readonly("type", &imperative::VarBase::Type)
       .def_property_readonly("dtype", &imperative::VarBase::DataType);
 
@@ -838,13 +884,14 @@ void BindImperative(py::module *m_ptr) {
       .def("reset", &imperative::jit::ProgramDescTracer::Reset);
 
   py::class_<imperative::Tracer, std::shared_ptr<imperative::Tracer>>(
-      m, "Tracer",
-      R"DOC()DOC")
+      m, "Tracer", R"DOC()DOC")
       .def("__init__",
            [](imperative::Tracer &self) { new (&self) imperative::Tracer(); })
       .def_property("_enable_program_desc_tracing",
                     &imperative::Tracer::IsProgramDescTracingEnabled,
                     &imperative::Tracer::SetEnableProgramDescTracing)
+      .def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled,
+                    &imperative::Tracer::SetEnableAutoCast)
       .def_property("_train_mode", &imperative::Tracer::HasGrad,
                     &imperative::Tracer::SetHasGrad)
       .def_property(
@@ -856,6 +903,9 @@ void BindImperative(py::module *m_ptr) {
             if (py::isinstance<platform::CUDAPlace>(obj)) {
               auto p = obj.cast<platform::CUDAPlace *>();
               self.SetExpectedPlace(*p);
+            } else if (py::isinstance<platform::XPUPlace>(obj)) {
+              auto p = obj.cast<platform::XPUPlace *>();
+              self.SetExpectedPlace(*p);
             } else if (py::isinstance<platform::CPUPlace>(obj)) {
               auto p = obj.cast<platform::CPUPlace *>();
               self.SetExpectedPlace(*p);
@@ -864,7 +914,8 @@ void BindImperative(py::module *m_ptr) {
               self.SetExpectedPlace(*p);
             } else {
               PADDLE_THROW(platform::errors::InvalidArgument(
-                  "Incompatible Place Type: supports CUDAPlace, CPUPlace, "
+                  "Incompatible Place Type: supports XPUPlace, CUDAPlace, "
+                  "CPUPlace, "
                   "and CUDAPinnedPlace, "
                   "but got Unknown Type!"));
             }
@@ -874,6 +925,39 @@ void BindImperative(py::module *m_ptr) {
            py::return_value_policy::reference)
       .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName,
            py::arg("key") = "eager_tmp")
+      .def(
+          "_set_amp_op_list",
+          [](imperative::Tracer &self,
+             std::unordered_set<std::string> &allow_ops,
+             std::unordered_set<std::string> &block_ops) {
+            // NOTE(zhiqiu): The automatic conversion in pybind11 between c++
+            // STL and python set/list/dict involve a copy operation that
+            // prevents pass-by-reference semantics, so it is ok to swap.
+            // The reaseon why not directly pass
+            // std::shared_ptr<std::unordered_set<std::string>>
+            // is that pybind11 forbid shared_ptr<T> where T is not custom type.
+            imperative::AmpOperators::Instance().GetAllowOps()->swap(allow_ops);
+            imperative::AmpOperators::Instance().GetBlockOps()->swap(block_ops);
+          })
+      .def("_get_amp_op_list",
+           [](imperative::Tracer &self) {
+             return std::make_tuple(
+                 *(imperative::AmpOperators::Instance().GetAllowOps()),
+                 *(imperative::AmpOperators::Instance().GetBlockOps()));
+           })
+      .def("trace",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs, const platform::XPUPlace &place,
+              bool trace_backward) {
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               py::gil_scoped_release release;
+               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
+                            std::move(attrs), place, trace_backward);
+             }
+           })
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h
index 597ead9327e233df785b58437afce8fa75a058c3..70b321f658cd2cf1bd43cd6440bf83e1f4dab140 100644
--- a/paddle/fluid/pybind/op_function.h
+++ b/paddle/fluid/pybind/op_function.h
@@ -18,9 +18,11 @@
 #include <pybind11/complex.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/variable.h"
@@ -31,15 +33,93 @@
 namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-static inline void ConstructAttrMapFromPyArgs(framework::AttributeMap* attrs,
+
+static inline std::shared_ptr<imperative::VarBase> CastPyHandleToVarBase(
+    const std::string& op_type, const std::string& arg_name, int arg_idx,
+    const py::handle& handle) {
+  PyObject* py_obj = handle.ptr();  // get underlying PyObject
+  if (!py_obj || py_obj == Py_None) {
+    return nullptr;
+  }
+  try {
+    return py::cast<std::shared_ptr<imperative::VarBase>>(py::handle(py_obj));
+  } catch (py::cast_error&) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument '%s' (position %d) must be Tensor, but got "
+        "%s",
+        op_type, arg_name, arg_idx, Py_TYPE(py_obj)->tp_name));
+  }
+}
+
+static inline std::vector<std::shared_ptr<imperative::VarBase>>
+CastPyHandleToVarBaseList(const std::string& op_type,
+                          const std::string& arg_name, int arg_idx,
+                          const py::handle& handle) {
+  PyObject* py_obj = handle.ptr();  // get underlying PyObject
+  if (!py_obj || py_obj == Py_None) {
+    return {};
+  }
+  std::vector<std::shared_ptr<imperative::VarBase>> result;
+  if (PyList_Check(py_obj) || PyTuple_Check(py_obj)) {
+    auto size = PyTuple_Check(py_obj) ? PyTuple_GET_SIZE(py_obj)
+                                      : PyList_GET_SIZE(py_obj);
+    for (auto i = 0; i < size; ++i) {
+      PyObject* item = PyTuple_Check(py_obj) ? PyTuple_GET_ITEM(py_obj, i)
+                                             : PyList_GET_ITEM(py_obj, i);
+      if (!item || item == Py_None) {
+        result.emplace_back(nullptr);
+        continue;
+      }
+      try {
+        result.emplace_back(
+            py::cast<std::shared_ptr<imperative::VarBase>>(py::handle(item)));
+      } catch (py::cast_error&) {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "%s(): argument '%s' (position %d) must be list of "
+            "Tensors, but "
+            "got %s in list (item %d)",
+            op_type, arg_name, arg_idx, Py_TYPE(item)->tp_name, i));
+      }
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument '%s' (position %d) must be list of Tensors, but got "
+        "%s",
+        op_type, arg_name, arg_idx, Py_TYPE(py_obj)->tp_name));
+  }
+  return result;
+}  // namespace pybind
+
+static inline void ConstructAttrMapFromPyArgs(const std::string& op_type,
+                                              int start_idx,
+                                              framework::AttributeMap* attrs,
                                               const py::args& args) {
   PADDLE_ENFORCE_EQ(
       args.size() % 2, 0,
       platform::errors::InvalidArgument(
           "The number of arguments for arributes should be even."));
   for (size_t i = 0; i < args.size(); i += 2) {
-    auto name = args[i].cast<std::string>();
-    auto value = args[i + 1].cast<framework::Attribute>();
+    std::string name;
+    framework::Attribute value;
+    try {
+      name = args[i].cast<std::string>();
+    } catch (std::exception& e) {
+      PyObject* py_obj = args[i].ptr();  // get underlying PyObject
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument (position %d) must be str, but got "
+          "%s",
+          op_type, start_idx + i, Py_TYPE(py_obj)->tp_name));
+    }
+    try {
+      value = args[i + 1].cast<framework::Attribute>();
+    } catch (std::exception& e) {
+      PyObject* py_obj = args[i + 1].ptr();  // get underlying PyObject
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument (position %d) must be "
+          "Attribute type (one of str, bool, int, int64, float, or list of "
+          "them), but got %s",
+          op_type, start_idx + i + 1, Py_TYPE(py_obj)->tp_name));
+    }
     (*attrs)[name] = value;
   }
 }
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 7412eede118d122b14c69ab663836c156eb740e2..256faf04ea6de5835f22113537caac49ca1dbab4 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -40,6 +40,9 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"assign", {"X"}},
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"X", "InScale", "InAccum", "InState"}},
+    {"nll_loss", {"X", "Label", "Weight"}},
+    {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}},
+    {"gather", {"X", "Index", "Axis"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
@@ -56,6 +59,10 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"batch_norm",
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
+    {"sync_batch_norm",
+     {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
+      "ReserveSpace"}},
+    {"unique", {"Out", "Index", "Indices", "Counts"}},
 };
 
 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
@@ -75,9 +82,23 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
      {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
     {"momentum", {"ParamOut", "VelocityOut"}},
     {"batch_norm", {"MeanOut", "VarianceOut"}},
+    {"sync_batch_norm", {"MeanOut", "VarianceOut"}},
     {"accuracy", {"Correct", "Total"}},
     {"fill_constant", {"Out"}},
     {"matmul", {"Out"}},
+    {"c_broadcast", {"Out"}},
+    {"c_allreduce_sum", {"Out"}},
+    {"c_allreduce_max", {"Out"}},
+    {"c_allreduce_min", {"Out"}},
+    {"c_allreduce_prod", {"Out"}},
+    {"c_reduce_sum", {"Out"}},
+    {"c_reduce_max", {"Out"}},
+    {"c_reduce_min", {"Out"}},
+    {"c_reduce_prod", {"Out"}},
+    {"c_reduce", {"Out"}},
+    {"c_allgather", {"Out"}},
+    {"c_scatter", {"Out"}},
+    {"barrier", {"Out"}},
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"Out", "OutScale", "OutAccum", "OutState"}},
     {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
@@ -115,8 +136,19 @@ const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"(
 const char* ARG_OUT_NUM = R"(%sNum)";
 const char* ARG_OUT_NUM_TYPE = R"(size_t )";
 
-const char* VAR_TYPE = R"(std::shared_ptr<imperative::VarBase>)";
-const char* VAR_LIST_TYPE = R"(std::vector<std::shared_ptr<imperative::VarBase>>)";
+const char* IN_VAR_TYPE = R"(py::handle)";
+const char* IN_VAR_LIST_TYPE = R"(py::handle)";
+
+const char* OUT_VAR_TYPE = R"(std::shared_ptr<imperative::VarBase>)";
+const char* OUT_VAR_LIST_TYPE = R"(std::vector<std::shared_ptr<imperative::VarBase>>)";
+
+const char* CAST_VAR_TEMPLATE = R"(
+  auto %s = CastPyHandleToVarBase("%s", "%s", %d, %s);)";
+
+const char* CAST_VAR_LIST_TEMPLATE = R"(
+  auto %s = CastPyHandleToVarBaseList("%s", "%s", %d, %s);)";
+
+
 const char* ARG_TEMPLATE = R"(const %s& %s)";
 
 const char* RETURN_TUPLE_TYPE = R"(std::tuple<%s>)";
@@ -132,8 +164,9 @@ const char* OP_FUNCTION_TEMPLATE =
 R"(
 %s %s(%s)
 {
+  %s
   framework::AttributeMap attrs;
-  ConstructAttrMapFromPyArgs(&attrs, args);
+  ConstructAttrMapFromPyArgs("%s", %d, &attrs, args);
   {
     py::gil_scoped_release release;
     auto tracer = imperative::GetCurrentTracer();
@@ -163,6 +196,10 @@ static inline bool FindPassingOutsMap(const std::string& op_type,
   return op_passing_outs_map[op_type].count(out_name);
 }
 
+static inline std::string TempName(const std::string& name) {
+  return name + '_';
+}
+
 static std::tuple<std::vector<std::string>, std::vector<std::string>>
 GenerateOpFunctions(const std::string& module_name) {
   auto& op_info_map = paddle::framework::OpInfoMap::Instance().map();
@@ -186,16 +223,26 @@ GenerateOpFunctions(const std::string& module_name) {
     std::string ins_initializer = "{";
     std::string ins_initializer_with_null = "";
     std::string py_arg = "";
+    int arg_idx = 0;
+    int input_args_num = 0;
+    std::string ins_cast_str = "";
     for (auto& input : op_proto->inputs()) {
       auto& in_name = input.name();
       // skip those dispensable inputs, like ResidualData in conv2d
       if (input.dispensable() && !FindInsMap(op_type, in_name)) {
         continue;
       }
-      const auto in_type = input.duplicable() ? VAR_LIST_TYPE : VAR_TYPE;
-      auto input_arg = paddle::string::Sprintf(ARG_TEMPLATE, in_type, in_name);
+      const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE;
+      auto input_arg =
+          paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name));
       input_args += input_arg;
       input_args += ",";
+      input_args_num++;
+      const auto in_cast_type =
+          input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE;
+      ins_cast_str +=
+          paddle::string::Sprintf(in_cast_type, in_name, op_type, in_name,
+                                  arg_idx++, TempName(in_name));
 
       if (input.dispensable()) {
         const auto in_template = input.duplicable()
@@ -234,7 +281,8 @@ GenerateOpFunctions(const std::string& module_name) {
       if (output.dispensable() && !FindOutsMap(op_type, out_name)) {
         continue;
       }
-      const auto out_type = output.duplicable() ? VAR_LIST_TYPE : VAR_TYPE;
+      const auto out_type =
+          output.duplicable() ? OUT_VAR_LIST_TYPE : OUT_VAR_TYPE;
       const auto return_template =
           output.duplicable() ? RETURN_LIST_TEMPLATE : RETURN_TEMPLATE;
       if (FindPassingOutsMap(op_type, out_name)) {
@@ -243,6 +291,7 @@ GenerateOpFunctions(const std::string& module_name) {
         }
         input_args += out_type;
         input_args += out_name;
+        input_args_num++;
 
         if (output.dispensable()) {
           const auto out_template =
@@ -269,6 +318,7 @@ GenerateOpFunctions(const std::string& module_name) {
           auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name);
           input_args += ARG_OUT_NUM_TYPE;
           input_args += out_num_str;
+          input_args_num++;
           outs_initializer += paddle::string::Sprintf(
               OUT_DUPLICABLE_INITIALIZER_TEMPLATE, out_name, out_num_str);
         } else {
@@ -308,9 +358,9 @@ GenerateOpFunctions(const std::string& module_name) {
     // generate op funtcion body
     auto op_function_str = paddle::string::Sprintf(
         OP_FUNCTION_TEMPLATE, return_type, func_name, function_args,
-        outs_initializer, ins_initializer,
-        ins_initializer_with_null + outs_initializer_with_null, op_type,
-        return_str);
+        ins_cast_str, op_type, input_args_num, outs_initializer,
+        ins_initializer, ins_initializer_with_null + outs_initializer_with_null,
+        op_type, return_str);
 
     // generate pybind item
     auto bind_function_str = paddle::string::Sprintf(
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d58c36dd8f20e35fe4a564bd7e119c17f1296ba2..4b8f7c853ceaf2148722a9c65f38e0ec3d9f4df5 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -64,8 +64,11 @@ limitations under the License. */
 #include "paddle/fluid/pybind/data_set_py.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/fleet_wrapper_py.h"
+#include "paddle/fluid/pybind/generator_py.h"
 #include "paddle/fluid/pybind/global_value_getter_setter.h"
+#include "paddle/fluid/pybind/gloo_context_py.h"
 #include "paddle/fluid/pybind/gloo_wrapper_py.h"
+#include "paddle/fluid/pybind/heter_wrapper_py.h"
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/ir.h"
@@ -88,6 +91,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_info.h"
+#endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
 #include "paddle/fluid/pybind/communicator_py.h"
 #endif
@@ -116,6 +123,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithXPU() {
+#ifndef PADDLE_WITH_XPU
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithMKLDNN() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
@@ -340,6 +355,10 @@ PYBIND11_MODULE(core_noavx, m) {
 
   m.def("set_num_threads", &platform::SetNumThreads);
 
+#ifdef PADDLE_WITH_CUDA
+  m.def("cudnn_version", &platform::CudnnVersion);
+#endif
+
   m.def("from_dlpack", [](py::capsule *dltensor) {
     DLManagedTensor *dmt = reinterpret_cast<DLManagedTensor *>(
         PyCapsule_GetPointer(dltensor->ptr(), "dltensor"));
@@ -465,6 +484,10 @@ PYBIND11_MODULE(core_noavx, m) {
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<float>(place);
            })
+      .def("_alloc_float",
+           [](Tensor &self, paddle::platform::XPUPlace &place) {
+             self.mutable_data<float>(place);
+           })
       .def("_alloc_float",
            [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<float>(place);
@@ -477,6 +500,10 @@ PYBIND11_MODULE(core_noavx, m) {
            [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<int>(place);
            })
+      .def("_alloc_int",
+           [](Tensor &self, paddle::platform::XPUPlace &place) {
+             self.mutable_data<int>(place);
+           })
       .def("_alloc_int",
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<int>(place);
@@ -494,6 +521,11 @@ PYBIND11_MODULE(core_noavx, m) {
               paddle::framework::proto::VarType::Type type) {
              return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
            })
+      .def("_mutable_data",
+           [](Tensor &self, paddle::platform::XPUPlace &place,
+              paddle::framework::proto::VarType::Type type) {
+             return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
+           })
       .def("_mutable_data",
            [](Tensor &self, paddle::platform::CUDAPlace &place,
               paddle::framework::proto::VarType::Type type) {
@@ -507,6 +539,8 @@ PYBIND11_MODULE(core_noavx, m) {
       .def("_clear", &Tensor::clear)
       .def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
+      .def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
+           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
@@ -516,7 +550,7 @@ PYBIND11_MODULE(core_noavx, m) {
         
         Args:
           lod (numpy.ndarray): The data to set.
-          place (CPUPlace|CUDAPlace|CUDAPinnedPlace): The place where the 
+          place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the 
           LoDTensor is to be set.
           zero_copy (bool, optional): Whether to share memory with the input numpy array.
           This parameter only works with CPUPlace. Default: False.
@@ -1069,7 +1103,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("find_var", &Scope::FindVar, py::arg("name"),
            R"DOC(
            Find variable named :code:`name` in the current scope or
-           its parent scope. Return None if not found.
+           its parent scope. Return None if not found. 
 
            Args:
                name (str): the variable name.
@@ -1212,8 +1246,6 @@ All parameter, weight, gradient are variables in Paddle.
         []() { return std::string(framework::kEmptyVarName); });
   m.def("grad_var_suffix",
         []() { return std::string(framework::kGradVarSuffix); });
-  m.def("loaded_var_suffix",
-        []() { return std::string(framework::kLoadedVarSuffix); });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
@@ -1226,6 +1258,18 @@ All parameter, weight, gradient are variables in Paddle.
                       -> paddle::platform::DeviceContext* {
                     return new paddle::platform::CPUDeviceContext();
                   })
+      .def_static("create",
+                  [](paddle::platform::XPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifndef PADDLE_WITH_XPU
+             PADDLE_THROW(
+                 platform::errors::PermissionDenied(
+                 "Cannot use XPUPlace in CPU/GPU version, "
+                 "Please recompile or reinstall Paddle with XPU support."));
+#else
+                    return new paddle::platform::XPUDeviceContext(place);
+#endif
+                  })
       .def_static("create",
                   [](paddle::platform::CUDAPlace& place)
                       -> paddle::platform::DeviceContext* {
@@ -1320,14 +1364,75 @@ All parameter, weight, gradient are variables in Paddle.
              std::exit(-1);
 #endif
            })
+#ifdef PADDLE_WITH_CUDA
+      .def("get_device_id",
+           [](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
       .def("_type", &PlaceIndex<platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
+      .def("_get_device_id",
+           [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
+#endif
       .def("__str__", string::to_string<const platform::CUDAPlace &>);
 
+  py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
+    **Note**:
+    Examples:
+        .. code-block:: python
+          import paddle.fluid as fluid
+          xpu_place = fluid.XPUPlace(0)
+        )DOC")
+      .def("__init__",
+           [](platform::XPUPlace &self, int dev_id) {
+#ifdef PADDLE_WITH_XPU
+             if (UNLIKELY(dev_id < 0)) {
+               LOG(ERROR) << string::Sprintf(
+                   "Invalid XPUPlace(%d), device id must be 0 or "
+                   "positive integer",
+                   dev_id);
+               std::exit(-1);
+             }
+             if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
+               if (platform::GetXPUDeviceCount() == 0) {
+                 LOG(ERROR) << "Cannot use XPU because there is no XPU "
+                               "detected on your "
+                               "machine.";
+                 std::exit(-1);
+               } else {
+                 LOG(ERROR) << string::Sprintf(
+                     "Invalid XPUPlace(%d), must inside [0, %d), because XPU "
+                     "number on your machine is %d",
+                     dev_id, platform::GetXPUDeviceCount(),
+                     platform::GetXPUDeviceCount());
+                 std::exit(-1);
+               }
+             }
+             new (&self) platform::XPUPlace(dev_id);
+#else
+             LOG(ERROR) << string::Sprintf(
+                 "Cannot use XPU because you have installed CPU/GPU version "
+                 "PaddlePaddle.\n"
+                 "If you want to use XPU, please try to install XPU version "
+                 "PaddlePaddle by: pip install paddlepaddle-xpu\n"
+                 "If you only have CPU, please change XPUPlace(%d) to be "
+                 "CPUPlace().\n",
+                 dev_id);
+             std::exit(-1);
+#endif
+           })
+      .def("_type", &PlaceIndex<platform::XPUPlace>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
+      .def("__str__", string::to_string<const platform::XPUPlace &>);
+
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
     CPUPlace is a descriptor of a device.
     It represents a CPU device allocated or to be allocated with Tensor or LoDTensor.
@@ -1342,6 +1447,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("_type", &PlaceIndex<platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
       .def("_equals",
@@ -1376,6 +1482,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
       .def("_equals",
@@ -1388,11 +1496,14 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
       .def("is_cpu_place",
            [](platform::Place &self) { return platform::is_cpu_place(self); })
+      .def("is_xpu_place",
+           [](platform::Place &self) { return platform::is_xpu_place(self); })
       .def("is_cuda_pinned_place",
            [](platform::Place &self) {
              return platform::is_cuda_pinned_place(self);
@@ -1401,12 +1512,20 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) {
              return BOOST_GET_CONST(platform::CUDAPlace, self).device;
            })
+      .def("xpu_device_id",
+           [](platform::Place &self) {
+             return BOOST_GET_CONST(platform::XPUPlace, self).device;
+           })
       .def("set_place", [](platform::Place &self,
                            const platform::Place &other) { self = other; })
       .def("set_place",
            [](platform::Place &self, const platform::CPUPlace &cpu_place) {
              self = cpu_place;
            })
+      .def("set_place",
+           [](platform::Place &self, const platform::XPUPlace &xpu_place) {
+             self = xpu_place;
+           })
       .def("set_place",
            [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
              self = gpu_place;
@@ -1434,6 +1553,9 @@ All parameter, weight, gradient are variables in Paddle.
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CPUPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::XPUPlace &place) { self.Run(scope, place); })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CUDAPlace &place) { self.Run(scope, place); })
@@ -1534,6 +1656,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
@@ -2479,6 +2602,9 @@ All parameter, weight, gradient are variables in Paddle.
       .def("device_count", &ParallelExecutor::DeviceCount);
 
   BindFleetWrapper(&m);
+#ifdef PADDLE_WITH_PSLIB
+  BindHeterWrapper(&m);
+#endif
   BindGlooWrapper(&m);
   BindBoxHelper(&m);
 #ifdef PADDLE_WITH_BOX_PS
@@ -2486,11 +2612,15 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #ifdef PADDLE_WITH_NCCL
   BindNCCLWrapper(&m);
+#endif
+#ifdef PADDLE_WITH_GLOO
+  BindGlooContext(&m);
 #endif
   BindGraph(&m);
   BindNode(&m);
   BindInferenceApi(&m);
   BindDataset(&m);
+  BindGenerator(&m);
 #ifdef PADDLE_WITH_CRYPTO
   BindCrypto(&m);
 #endif
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 0dd30e562b66847551e5f27b45042fb077fc7bc7..856c5aac5eb38c7da82a956c5823d1a19be5c8d7 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -125,11 +125,12 @@ class MultiDeviceFeedReader {
       const std::vector<framework::proto::VarType::Type> &dtypes,
       const std::vector<bool> &need_check_feed,
       const std::vector<platform::Place> &dst_places, bool use_double_buffer,
-      bool drop_last)
+      bool drop_last, bool pin_memory = false)
       : queue_(queue),
         names_(names),
         pool_(new ::ThreadPool(dst_places.size())),
-        drop_last_(drop_last) {
+        drop_last_(drop_last),
+        pin_memory_(pin_memory) {
     std::vector<framework::DDim> dims;
     for (auto &shape : shapes) {
       dims.push_back(framework::make_ddim(shape));
@@ -157,7 +158,7 @@ class MultiDeviceFeedReader {
         VLOG(10) << "Creating " << i << "-th BufferedReader";
         holder->Reset(
             framework::MakeDecoratedReader<operators::reader::BufferedReader>(
-                reader, p, 2));
+                reader, p, 2, pin_memory_));
       } else {
         if (platform::is_gpu_place(p)) {
           PADDLE_THROW(platform::errors::PermissionDenied(
@@ -322,6 +323,7 @@ class MultiDeviceFeedReader {
 
   std::vector<std::vector<framework::LoDTensor>> ret_;
   bool drop_last_;
+  bool pin_memory_;
 };
 
 template <typename QueueType>
@@ -445,10 +447,10 @@ void BindReader(py::module *module) {
            const std::vector<framework::proto::VarType::Type> &dtypes,
            const std::vector<bool> &need_check_feed,
            const std::vector<platform::Place> &dst_places,
-           bool use_double_buffer, bool drop_last) {
+           bool use_double_buffer, bool drop_last, bool pin_memory) {
           return new MultiDeviceFeedReader<reader::LoDTensorBlockingQueue>(
               queue, names, shapes, dtypes, need_check_feed, dst_places,
-              use_double_buffer, drop_last);
+              use_double_buffer, drop_last, pin_memory);
         },
         py::return_value_policy::take_ownership);
 
@@ -461,12 +463,12 @@ void BindReader(py::module *module) {
          const std::vector<framework::proto::VarType::Type> &dtypes,
          const std::vector<bool> &need_check_feed,
          const std::vector<platform::Place> &dst_places, bool use_double_buffer,
-         bool drop_last) {
+         bool drop_last, bool pin_memory) {
         queue->SetDeviceCount(dst_places.size());
         return new MultiDeviceFeedReader<
             reader::OrderedMultiDeviceLoDTensorBlockingQueue>(
             queue, names, shapes, dtypes, need_check_feed, dst_places,
-            use_double_buffer, drop_last);
+            use_double_buffer, drop_last, pin_memory);
       },
       py::return_value_policy::take_ownership);
 }
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index ba79c4b44374eb9b50ad4982a2eacd664fc6e75e..4377a8c2cef5aab7a200955cd25830d448014817 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <tuple>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -145,8 +146,14 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
   T b = static_cast<T>(0);
   if (platform::is_cpu_place(self.place())) {
     b = self.data<T>()[offset];
+  } else if (platform::is_xpu_place(self.place())) {
+#ifdef PADDLE_WITH_XPU
+    const T *a = self.data<T>();
+    auto p = BOOST_GET_CONST(platform::XPUPlace, self.place());
+    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
+#endif
+  } else if (platform::is_gpu_place(self.place())) {
 #ifdef PADDLE_WITH_CUDA
-  } else {
     const T *a = self.data<T>();
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
@@ -163,8 +170,14 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
                         "The offset exceeds the size of tensor."));
   if (platform::is_cpu_place(self->place())) {
     self->mutable_data<T>(self->place())[offset] = elem;
+  } else if (platform::is_xpu_place(self->place())) {
+#ifdef PADDLE_WITH_XPU
+    auto p = BOOST_GET_CONST(platform::XPUPlace, self->place());
+    T *a = self->mutable_data<T>(p);
+    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
+#endif
+  } else if (platform::is_gpu_place(self->place())) {
 #ifdef PADDLE_WITH_CUDA
-  } else {
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
@@ -194,6 +207,16 @@ void SetTensorFromPyArrayT(
       auto dst = self->mutable_data<T>(place);
       std::memcpy(dst, array.data(), array.nbytes());
     }
+  } else if (paddle::platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    auto dst = self->mutable_data<T>(place);
+    xpu_memcpy(dst, array.data(), array.nbytes(),
+               XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use XPUPlace in CPU/GPU version, "
+        "Please recompile or reinstall Paddle with XPU support."));
+#endif
   } else {
 #ifdef PADDLE_WITH_CUDA
     auto dst = self->mutable_data<T>(place);
@@ -211,7 +234,7 @@ void SetTensorFromPyArrayT(
     }
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
-        "Cannot use CUDAPlace in CPU only version, "
+        "Cannot use CUDAPlace or CUDAPinnedPlace in CPU only version, "
         "Please recompile or reinstall Paddle with CUDA support."));
 #endif
   }
@@ -354,8 +377,13 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
   if (platform::is_cpu_place(place)) {
     output->mutable_data(BOOST_GET_CONST(platform::CPUPlace, place),
                          self.type());
-#ifdef PADDLE_WITH_CUDA
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place),
+                         self.type());
+#endif
   } else {
+#ifdef PADDLE_WITH_CUDA
     if (platform::is_cuda_pinned_place(place)) {
       output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
                            self.type());
@@ -516,6 +544,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
     return py::array();
   }
   bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
+  bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
   const auto &tensor_dims = tensor.dims();
   auto tensor_dtype = tensor.type();
   size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
@@ -534,11 +563,11 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
 
   std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
 
-  if (!is_gpu_tensor) {
+  if (!is_gpu_tensor && !is_xpu_tensor) {
     if (!need_deep_copy) {
-      return py::array(py::buffer_info(
-          const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
-          static_cast<size_t>(tensor.dims().size()), py_dims, py_strides));
+      auto base = py::cast(std::move(tensor));
+      return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
+                       const_cast<void *>(tensor_buf_ptr), base);
     } else {
       py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
       PADDLE_ENFORCE_EQ(
@@ -557,28 +586,54 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
                            copy_bytes);
       return py_arr;
     }
-  }
-
+  } else if (is_xpu_tensor) {
+#ifdef PADDLE_WITH_XPU
+    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
+    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
+                      platform::errors::InvalidArgument(
+                          "PyArray is not writable, in which case memory leak "
+                          "or double free would occur"));
+    PADDLE_ENFORCE_EQ(
+        py_arr.owndata(), true,
+        platform::errors::InvalidArgument(
+            "PyArray does not own data, in which case  memory leak "
+            "or double free would occur"));
+
+    size_t copy_bytes = sizeof_dtype * numel;
+    auto p = BOOST_GET_CONST(platform::XPUPlace, tensor.place());
+    paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p,
+                         tensor_buf_ptr, copy_bytes);
+    return py_arr;
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use XPUPlace in CPU/GPU version, "
+        "Please recompile or reinstall Paddle with XPU support."));
+#endif
+  } else if (is_gpu_tensor) {
 #ifdef PADDLE_WITH_CUDA
-  py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
-  PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
-                    platform::errors::InvalidArgument(
-                        "PyArray is not writable, in which case memory leak "
-                        "or double free would occur"));
-  PADDLE_ENFORCE_EQ(py_arr.owndata(), true,
-                    platform::errors::InvalidArgument(
-                        "PyArray does not own data, in which case  memory leak "
-                        "or double free would occur"));
-
-  size_t copy_bytes = sizeof_dtype * numel;
-  paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
-                                  copy_bytes, cudaMemcpyDeviceToHost);
-  return py_arr;
+    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
+    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
+                      platform::errors::InvalidArgument(
+                          "PyArray is not writable, in which case memory leak "
+                          "or double free would occur"));
+    PADDLE_ENFORCE_EQ(
+        py_arr.owndata(), true,
+        platform::errors::InvalidArgument(
+            "PyArray does not own data, in which case  memory leak "
+            "or double free would occur"));
+
+    size_t copy_bytes = sizeof_dtype * numel;
+    paddle::platform::GpuMemcpySync(py_arr.mutable_data(), tensor_buf_ptr,
+                                    copy_bytes, cudaMemcpyDeviceToHost);
+    return py_arr;
 #else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "Cannot use CUDAPlace in CPU only version, "
-      "Please recompile or reinstall Paddle with CUDA support."));
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use CUDAPlace in CPU only version, "
+        "Please recompile or reinstall Paddle with CUDA support."));
 #endif
+  }
+  PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
+  return py::array();
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
index 04870f87c40dd305304579a454cb618bf1446e39..1f88eb2109aa23b6b60104451908b0a70c41c898 100644
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -29,6 +29,8 @@ function(train_test TARGET_NAME)
                 PROPERTIES DEPENDS test_${TARGET_NAME})
         set_tests_properties(test_train_${TARGET_NAME}${arg}
                 PROPERTIES LABELS "RUN_TYPE=DIST")
+        set_tests_properties(test_train_${TARGET_NAME}${arg}
+                PROPERTIES TIMEOUT 150)
     endforeach()
 endfunction(train_test)
 
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index 39db5a601d3d46c106a574870f02434bd4bd5cd1..d7a86b653bec44c260a845d454c771ec4440993b 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
 | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
 | `RUN_TEST` | OFF | Run unit test immediently after the build. |
-| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
 
 ## Docker Images
 
@@ -155,21 +154,6 @@ docker push
 kubectl ...
 ```
 
-### Reading source code with woboq codebrowser
-
-For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
-
-- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
-
-```bash
-./paddle/scripts/paddle_docker_build.sh html
-```
-
-- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
-
-```
-docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
-```
 
 ## More Options
 
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 0c96906afb917c2544c9fe4e2172033e84102e4f..c84574b21d883b24e1f89c59c3a724aae6621479 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -1,21 +1,91 @@
-@ECHO OFF
+@ECHO ON
 SETLOCAL
 
+rem Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+rem
+rem Licensed under the Apache License, Version 2.0 (the "License");
+rem you may not use this file except in compliance with the License.
+rem You may obtain a copy of the License at
+rem
+rem     http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+
+rem =================================================
+rem       Paddle CI Task On Windows Platform
+rem =================================================
+
 set work_dir=%cd%
+if exist build rmdir build /s/q
+mkdir build
+cd /d build
+
+rem ------initialize the virtual environment------
+if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
+set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
+
+rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled
+rem Now use system python environment temporarily
+rem set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
+rem %PYTHON_EXECUTABLE% -m pip install virtualenv
+rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci
+rem call paddle_winci\Scripts\activate.bat
+
+rem ------pre install requirement----------
+where python
+where pip
+pip install --upgrade pip --user
+pip install wheel --user
+pip install gym --user
+pip install -U -r %work_dir%\python\requirements.txt --user
+if %ERRORLEVEL% NEQ 0 (
+    call paddle_winci\Scripts\deactivate.bat 2>NUL
+    echo pip install requirements.txt failed!
+    exit /b 7
+)
+
+rem ------initialize common variable------
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
 if not defined BRANCH set BRANCH=develop
-if not defined PYTHON_ROOT set PYTHON_ROOT=c:\Python27
-if not defined WITH_MKL set WITH_MKL=ON
-if not defined WITH_AVX set WITH_AVX=ON
 if not defined WITH_AVX set WITH_AVX=ON
-if not defined WITH_GPU set WITH_GPU=OFF
 if not defined WITH_TESTING set WITH_TESTING=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF
-if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=d:/.cache/inference_demo
-if not defined THIRD_PARTY_PATH set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party
-set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe
-dir d:\.cache
+if not defined WITH_TPCACHE set WITH_TPCACHE=ON
+
+rem ------set cache third_party------
+set cache_dir=%work_dir%\..\cache
+dir %cache_dir%
+set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo
+
+if not exist %cache_dir%\tools (
+    git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
+    if %ERRORLEVEL% NEQ 0 exit /b %ERRORLEVEL%
+)
+
+if "%WITH_TPCACHE%"=="OFF" (
+    set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party
+    goto :CASE_%1
+)
+
+echo set -ex > cache.sh
+echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake  ^|md5sum ^| awk '{print $1}') >> cache.sh
+echo echo ${md5_content}^>md5.txt >> cache.sh
+
+%cache_dir%\tools\busybox64.exe cat cache.sh
+%cache_dir%\tools\busybox64.exe bash cache.sh
+
+set /p md5=< md5.txt
+if "%WITH_GPU%"=="ON" (
+    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5%
+) else (
+    set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5%
+)
 
 goto :CASE_%1
 
@@ -26,6 +96,8 @@ echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows"
 exit /b 1
 
 :CASE_wincheck_mkl
+set WITH_MKL=ON
+set WITH_GPU=OFF
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
@@ -35,24 +107,32 @@ call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 :CASE_wincheck_openblas
+set WITH_MKL=OFF
+set WITH_GPU=ON
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
 goto:success
 
+rem "Other configurations are added here"
+rem :CASE_wincheck_others
+rem call ...
+
+
 rem ---------------------------------------------------------------------------------------------
 :cmake
 echo    ========================================
 echo    Step 1. Cmake ...
 echo    ========================================
 
-mkdir build
-cd /d build
-cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
+echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
+cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH%
 goto:eof
 
 :cmake_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Cmake failed, will exit!
+exit /b 7
 
 rem ---------------------------------------------------------------------------------------------
 :build
@@ -60,38 +140,42 @@ echo    ========================================
 echo    Step 2. Buile Paddle ...
 echo    ========================================
 call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
-set build_times=1
 
+set build_times=1
 :build_tp
-echo BUILD THIRD_PARTY %build_times%
+echo Build third_party for %build_times% time:
 msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
-echo BUILD THIRD_PARTY RESULT %ERRORLEVEL%
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1  
     if %build_times% GTR 3 (
-        exit /b 1
+        exit /b 7
     ) else (
+        echo Build third_party failed, will retry!
         goto :build_tp
     )
 )
+echo Build third_party successfully!
 
 set build_times=1
 :build_paddle
-echo BUILD PADDLE %build_times%
-msbuild /m /p:Configuration=Release /verbosity:quiet paddle.sln
-echo BUILD PADDLE RESULT %ERRORLEVEL%
+echo Build Paddle for %build_times% time:
+msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1
     if %build_times% GTR 2 (
-        exit /b 1
+        exit /b 7
     ) else (
+        echo Build Paddle failed, will retry!
         goto :build_paddle
     )
 )
+echo Build Paddle successfully!
 goto:eof
 
 :build_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Build Paddle failed, will exit!
+exit /b 7
 
 rem ---------------------------------------------------------------------------------------------
 :test_whl_pacakage
@@ -100,49 +184,61 @@ echo    Step 3. Test pip install whl package ...
 echo    ========================================
 dir /s /b python\dist\*.whl > whl_file.txt
 set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
-%PYTHON_EXECUTABLE% -m pip install -U %PADDLE_WHL_FILE_WIN%
-echo import paddle.fluid;print(paddle.__version__) > test_whl.py
-%PYTHON_EXECUTABLE% test_whl.py
+
+pip uninstall -y paddlepaddle
+pip uninstall -y paddlepaddle-gpu
+pip install -U %PADDLE_WHL_FILE_WIN% --user
+if %ERRORLEVEL% NEQ 0 (
+    call paddle_winci\Scripts\deactivate.bat 2>NUL
+    echo pip install whl package failed!
+    exit /b 3
+)
+
+python %work_dir%\paddle\scripts\installation_validate.py
 goto:eof
 
 :test_whl_pacakage_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Test import paddle failed, will exit!
+exit /b 3
 
 rem ---------------------------------------------------------------------------------------------
 :unit_test
 echo    ========================================
 echo    Step 4. Running unit tests ...
 echo    ========================================
-%PYTHON_EXECUTABLE% -m pip install --upgrade pip
-dir %work_dir%\build\third_party\install\openblas\lib
-dir %work_dir%\build\third_party\install\openblas\bin
-dir %work_dir%\build\third_party\install\zlib\bin
-dir %work_dir%\build\third_party\install\mklml\lib
-dir %work_dir%\build\third_party\install\mkldnn\bin
-dir %work_dir%\build\third_party\install\warpctc\bin
-
-set PATH=%work_dir%\build\third_party\install\openblas\lib;%work_dir%\build\third_party\install\openblas\bin;%work_dir%\build\third_party\install\zlib\bin;%work_dir%\build\third_party\install\mklml\lib;%work_dir%\build\third_party\install\mkldnn\bin;%work_dir%\build\third_party\install\warpctc\bin;%PATH%
-ctest.exe --output-on-failure -C Release -j 10
+
+dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
+dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
+dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
+dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib
+dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
+dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
+
+set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
+ctest.exe --output-on-failure -C Release -j 8
 goto:eof
 
 :unit_test_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Running unit tests failed, will exit!
+exit /b 8
 
 rem ---------------------------------------------------------------------------------------------
 :test_inference
 echo    ========================================
 echo    Step 5. Testing fluid library for inference ...
 echo    ========================================
-if NOT EXIST "d:\.cache\tools" (
-  git clone https://github.com/zhouwei25/tools.git d:\.cache\tools
-)
+
 cd %work_dir%\paddle\fluid\inference\api\demo_ci
 
-d:\.cache\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% d:/.cache/inference_demo
+%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo
 goto:eof
 
 :test_inference_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo Testing fluid library for inference failed!
+exit /b 5
 
 rem ---------------------------------------------------------------------------------------------
 :check_change_of_unittest
@@ -164,7 +260,7 @@ echo     ============================================ >>  check_change_of_unitte
 echo     Generate unit tests.spec of this PR.         >>  check_change_of_unittest.sh
 echo     ============================================ >>  check_change_of_unittest.sh
 echo EOF>>  check_change_of_unittest.sh
-echo spec_path=$(pwd)/../paddle/fluid/UNITTEST_PR.spec>>  check_change_of_unittest.sh
+echo spec_path=$(pwd)/UNITTEST_PR.spec>>  check_change_of_unittest.sh
 echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
 echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>>  check_change_of_unittest.sh
 echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>>  check_change_of_unittest.sh
@@ -179,16 +275,16 @@ echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>>  check_c
 echo     git fetch upstream $BRANCH # develop is not fetched>>  check_change_of_unittest.sh
 echo fi>>  check_change_of_unittest.sh
 echo git checkout -b origin_pr >>  check_change_of_unittest.sh
-echo git checkout -b test_pr -t upstream/$BRANCH >>  check_change_of_unittest.sh
-echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE:\=\\% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >>  check_change_of_unittest.sh
+echo git checkout -f $BRANCH >>  check_change_of_unittest.sh
+echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >>  check_change_of_unittest.sh
 echo cat ^<^<EOF>>  check_change_of_unittest.sh
 echo     ============================================       >>  check_change_of_unittest.sh
 echo     Generate unit tests.spec of develop.               >>  check_change_of_unittest.sh
 echo     ============================================       >>  check_change_of_unittest.sh
 echo EOF>>  check_change_of_unittest.sh
-echo spec_path=$(pwd)/../paddle/fluid/UNITTEST_DEV.spec>>  check_change_of_unittest.sh
+echo spec_path=$(pwd)/UNITTEST_DEV.spec>>  check_change_of_unittest.sh
 echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
-echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/../paddle/fluid/UNITTEST_DEV.spec $(pwd)/../paddle/fluid/UNITTEST_PR.spec`>>  check_change_of_unittest.sh
+echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>>  check_change_of_unittest.sh
 echo if [ "$unittest_spec_diff" != "" ]; then>>  check_change_of_unittest.sh
 echo     # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420>>  check_change_of_unittest.sh
 echo     approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>>  check_change_of_unittest.sh
@@ -210,12 +306,13 @@ echo     else>>  check_change_of_unittest.sh
 echo          exit 1 >>  check_change_of_unittest.sh
 echo     fi>>  check_change_of_unittest.sh
 echo fi>>  check_change_of_unittest.sh
-echo git checkout origin_pr >>  check_change_of_unittest.sh
-d:\.cache\tools\busybox64.exe bash check_change_of_unittest.sh
+echo git checkout -f origin_pr >>  check_change_of_unittest.sh
+%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh
 goto:eof
 
 :check_change_of_unittest_error
-exit /b %ERRORLEVEL%
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+exit /b 1
 
 
 rem ---------------------------------------------------------------------------------------------
@@ -233,6 +330,10 @@ taskkill /f /im git-remote-https.exe 2>NUL
 taskkill /f /im vctip.exe 2>NUL
 taskkill /f /im cvtres.exe 2>NUL
 taskkill /f /im rc.exe 2>NUL
+taskkill /f /im %cd%\paddle\fluid\pybind\Release\op_function_generator.exe  2>NUL
+taskkill /f /im python.exe  2>NUL
+call paddle_winci\Scripts\deactivate.bat 2>NUL
+taskkill /f /im python.exe  2>NUL
 echo Windows CI run successfully!
 exit /b 0
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8684851283f21b0384bc2aa95808c2594726f122..57defebd7575b41c031957aa9c2f848861a006ac 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -195,6 +195,12 @@ function cmake_base() {
     distibuted_flag=${WITH_DISTRIBUTE:-OFF}
     grpc_flag=${WITH_GRPC:-${distibuted_flag}}
 
+    if [ "$SYSTEM" == "Darwin" ]; then
+        gloo_flag="OFF"
+    else
+        gloo_flag=${distibuted_flag}
+    fi
+
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
@@ -219,6 +225,7 @@ function cmake_base() {
         -DPY_VERSION=${PY_VERSION:-2.7}
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
         -DWITH_GRPC=${grpc_flag}
+	    -DWITH_GLOO=${gloo_flag}
         -DWITH_LITE=${WITH_LITE:-OFF}
         -DLITE_GIT_TAG=develop
     ========================================
@@ -249,6 +256,7 @@ EOF
         -DPY_VERSION=${PY_VERSION:-2.7} \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
         -DWITH_GRPC=${grpc_flag} \
+	    -DWITH_GLOO=${gloo_flag} \
         -DLITE_GIT_TAG=develop \
         -DWITH_LITE=${WITH_LITE:-OFF};build_error=$?
     if [ "$build_error" != 0 ];then
@@ -521,13 +529,16 @@ EOF
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
         ut_startTime_s=`date +%s`
-        ctest --output-on-failure -j $2
+        ctest --output-on-failure -j $2;mactest_error=$?
         ut_endTime_s=`date +%s`
         echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
         paddle version
         # Recovery proxy to avoid failure in later steps
         export http_proxy=$my_proxy
         export https_proxy=$my_proxy
+        if [ "$mactest_error" != 0 ];then
+            exit 8;
+        fi
     fi
 }
 
@@ -562,6 +573,7 @@ function generate_upstream_develop_api_spec() {
 }
 
 function generate_api_spec() {
+    set -e
     spec_kind=$2
     if [ "$spec_kind" != "PR" ] && [ "$spec_kind" != "DEV" ]; then
         echo "Not supported $2"
@@ -572,7 +584,8 @@ function generate_api_spec() {
     cd ${PADDLE_ROOT}/build/.check_api_workspace
     virtualenv .${spec_kind}_env
     source .${spec_kind}_env/bin/activate
-    pip install ${PADDLE_ROOT}/build/python/dist/*whl
+    pip install -r ${PADDLE_ROOT}/python/requirements.txt
+    pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
     spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
 
@@ -686,6 +699,13 @@ function assert_api_spec_approvals() {
     fi
 }
 
+function assert_file_diff_approvals() {
+    /bin/bash ${PADDLE_ROOT}/tools/check_file_diff_approvals.sh;file_approval_error=$?
+    if [ "$file_approval_error" != 0 ];then
+       exit 6
+    fi
+}
+
 
 function check_coverage() {
     /bin/bash ${PADDLE_ROOT}/tools/coverage/paddle_coverage.sh
@@ -893,7 +913,7 @@ set +x
                         multiple_card_tests="$multiple_card_tests|^$testcase$"
                     fi
                 else
-                    if [[ "${#single_card_tests}" -gt 3000 ]];then
+                    if [[ "${#single_card_tests}" -gt 10000 ]];then
                         if [[ "$single_card_tests_1" == "" ]]; then 
                             single_card_tests_1="^$testcase$"
                         else
@@ -919,17 +939,96 @@ set +x
         card_test "$multiple_card_tests" 2  # run cases with two GPUs
         card_test "$exclusive_tests"        # run cases exclusively, in this cases would be run with 4/8 GPUs
         collect_failed_tests
-        if [ -n "${failed_test_lists}" ];then
-            failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
-            echo "========================================"
-            echo "Summary Failed Tests... "
-            echo "========================================"
-            echo "The following tests FAILED: "
-            echo "${failed_test_lists_ult}"
-        fi
         rm -f $tmp_dir/*
+        exec_times=0
+        retry_unittests_record=''
+        retry_time=3
+        exec_time_array=('first' 'second' 'third')
+        if [ -n "$failed_test_lists" ];then
+            while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] )
+                do
+                    
+                    retry_unittests_record="$retry_unittests_record$failed_test_lists"
+                    failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                    read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                    echo "========================================="
+                    echo "This is the ${exec_time_array[$exec_times]} time to re-run"
+                    echo "========================================="
+                    echo "The following unittest will be re-run:"
+                    echo "${failed_test_lists_ult}"
+                        
+                    for line in ${retry_unittests[@]} ;
+                        do
+
+                            one_card_tests=$single_card_tests'|'$single_card_tests_1
+
+                            read tmp_one_tmp <<< "$( echo $one_card_tests | grep -oEi $line )"
+                            read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )"
+                            read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )"
+
+                            if [[ "$tmp_one_tmp" != ""  ]]; then
+                                if [[ "$one_card_retry" == "" ]]; then
+                                    one_card_retry="^$line$"
+                                else
+                                    one_card_retry="$one_card_retry|^$line$"
+                                fi
+                            elif [[ "$tmp_mul_tmp" != "" ]]; then
+                                if [[ "$multiple_card_retry" == "" ]]; then
+                                    multiple_card_retry="^$line$"
+                                else
+                                    multiple_card_retry="$multiple_card_retry|^$line$"
+                                fi
+                            else
+                                if [[ "$exclusive_retry" == "" ]];then
+                                    exclusive_retry="^$line$"
+                                else
+                                    exclusive_retry="$exclusive_retry|^$line$"
+                                fi
+                            fi
+
+                        done
+
+                    if [[ "$one_card_retry" != "" ]]; then
+                        card_test "$one_card_retry" 1
+                    fi
+
+                    if [[ "$multiple_card_retry" != "" ]]; then
+                        card_test "$multiple_card_retry" 2
+                    fi
+
+                    if [[ "$exclusive_retry" != "" ]]; then
+                        card_test "$exclusive_retry"
+                    fi
+                    
+                    exec_times=$[$exec_times+1]
+                    failed_test_lists=''
+                    collect_failed_tests
+                    rm -f $tmp_dir/*
+                    one_card_retry=''
+                    multiple_card_retry=''
+                    exclusive_retry=''
+                    retry_unittests=''
+                done
+        fi
+
+
+       
         if [[ "$EXIT_CODE" != "0" ]]; then
-            exit 8;
+            if [[ "$failed_test_lists" == "" ]]; then
+                echo "========================================"
+                echo "There are failed tests, which have been successful after re-run:"
+                echo "========================================"
+                echo "The following tests have been re-ran:"
+                echo "${retry_unittests_record}"
+            else
+                failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                echo "========================================"
+                echo "Summary Failed Tests... "
+                echo "========================================"
+                echo "The following tests FAILED: "
+                echo "${failed_test_lists_ult}"
+                exit 8;
+            fi
         fi
 set -ex
     fi
@@ -1009,22 +1108,6 @@ EOF
       esac
 }
 
-function gen_html() {
-    cat <<EOF
-    ========================================
-    Converting C++ source code into HTML ...
-    ========================================
-EOF
-    export WOBOQ_OUT=${PADDLE_ROOT}/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-    	-b ${PADDLE_ROOT}/build \
-    	-a \
-    	-o $WOBOQ_OUT \
-    	-p paddle:${PADDLE_ROOT}
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-}
 
 function gen_dockerfile() {
     # Set BASE_IMAGE according to env variables
@@ -1260,9 +1343,13 @@ EOF
     ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
              ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
              ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
+    EXIT_CODE=$?
     fluid_endTime_s=`date +%s`
     echo "test_fluid_lib Total Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s"          
     ./clean.sh
+    if [[ "$EXIT_CODE" != "0" ]]; then
+        exit 8;
+    fi
 }
 
 function test_fluid_lib_train() {
@@ -1274,9 +1361,13 @@ EOF
     fluid_train_startTime_s=`date +%s`
     cd ${PADDLE_ROOT}/paddle/fluid/train/demo
     ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON}
+    EXIT_CODE=$?
     fluid_train_endTime_s=`date +%s`
     echo "test_fluid_lib_train Total Time: $[ $fluid_train_endTime_s - $fluid_train_startTime_s ]s"
     ./clean.sh
+    if [[ "$EXIT_CODE" != "0" ]]; then
+        exit 8;
+    fi
 }
 
 function build_document_preview() {
@@ -1339,9 +1430,6 @@ function main() {
       gen_doc_lib)
         gen_doc_lib $2
         ;;
-      html)
-        gen_html
-        ;;
       dockerfile)
         gen_dockerfile ${PYTHON_ABI:-""}
         ;;
@@ -1380,7 +1468,7 @@ function main() {
       test_inference)
         gen_fluid_lib ${parallel_number}
         test_fluid_lib
-        test_fluid_lib_train
+        #test_fluid_lib_train
         ;;
       test_train)
         gen_fluid_lib ${parallel_number}
@@ -1389,6 +1477,9 @@ function main() {
       assert_api_approvals)
         assert_api_spec_approvals
         ;;
+      assert_file_approvals)
+        assert_file_diff_approvals
+        ;; 
       maccheck)
         cmake_gen_and_build_mac ${PYTHON_ABI:-""}
         run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 59dfc5c9d0311342fc72d8400a3abddd3f6d778b..779a6842ebb03e2afcdb7718f77eb9b0d2c09a83 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -75,14 +75,12 @@ IF(WIN32)
   add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
     COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ELSE(WIN32)
   add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND touch stub.cc
     COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
 ENDIF()
 
@@ -93,6 +91,7 @@ set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 if (WITH_TESTING)
   add_subdirectory(paddle/reader/tests)
   add_subdirectory(paddle/dataset/tests)
+  add_subdirectory(paddle/tests)
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
   add_subdirectory(paddle/fluid/contrib/slim/tests)
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 6cc986c61e1db1990cde9598cccd5ee307b31df5..4e1e04043ad7d2fd72bfe891b755a2503c2096b3 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -31,27 +31,33 @@ import paddle.reader
 import paddle.dataset
 import paddle.batch
 batch = batch.batch
+from .fluid import monkey_patch_variable
+from .fluid.dygraph import monkey_patch_math_varbase
+monkey_patch_variable()
+monkey_patch_math_varbase()
+import paddle.framework
+from .framework import VarBase as Tensor
+from .framework import ComplexVariable as ComplexTensor
 import paddle.compat
 import paddle.distributed
 import paddle.sysconfig
 import paddle.tensor
+import paddle.distribution
 import paddle.nn
-import paddle.fleet
-import paddle.framework
-import paddle.imperative
+import paddle.distributed.fleet
 import paddle.optimizer
 import paddle.metric
+import paddle.device
 import paddle.incubate.complex as complex
 
 # TODO: define alias in tensor and framework directory
 
 from .tensor.random import randperm
+from .tensor.random import bernoulli
 
 from .tensor.attribute import rank  #DEFINE_ALIAS
 from .tensor.attribute import shape  #DEFINE_ALIAS
-from .tensor.creation import create_tensor  #DEFINE_ALIAS
-# from .tensor.creation import create_lod_tensor        #DEFINE_ALIAS
-# from .tensor.creation import create_random_int_lodtensor        #DEFINE_ALIAS
+from .tensor.creation import to_tensor  #DEFINE_ALIAS
 from .tensor.creation import crop_tensor  #DEFINE_ALIAS
 from .tensor.creation import diag  #DEFINE_ALIAS
 from .tensor.creation import eye  #DEFINE_ALIAS
@@ -69,8 +75,6 @@ from .tensor.creation import full_like  #DEFINE_ALIAS
 from .tensor.creation import triu  #DEFINE_ALIAS
 from .tensor.creation import tril  #DEFINE_ALIAS
 from .tensor.creation import meshgrid  #DEFINE_ALIAS
-from .tensor.io import save  #DEFINE_ALIAS
-from .tensor.io import load  #DEFINE_ALIAS
 from .tensor.linalg import matmul  #DEFINE_ALIAS
 from .tensor.linalg import dot  #DEFINE_ALIAS
 # from .tensor.linalg import einsum        #DEFINE_ALIAS
@@ -87,7 +91,7 @@ from .tensor.logic import equal  #DEFINE_ALIAS
 from .tensor.logic import greater_equal  #DEFINE_ALIAS
 from .tensor.logic import greater_than  #DEFINE_ALIAS
 from .tensor.logic import is_empty  #DEFINE_ALIAS
-from .tensor.logic import isfinite  #DEFINE_ALIAS
+#from .tensor.logic import isfinite  #DEFINE_ALIAS
 from .tensor.logic import less_equal  #DEFINE_ALIAS
 from .tensor.logic import less_than  #DEFINE_ALIAS
 from .tensor.logic import logical_and  #DEFINE_ALIAS
@@ -103,7 +107,9 @@ from .tensor.logic import equal_all  #DEFINE_ALIAS
 from .tensor.manipulation import cast  #DEFINE_ALIAS
 from .tensor.manipulation import concat  #DEFINE_ALIAS
 from .tensor.manipulation import expand  #DEFINE_ALIAS
+from .tensor.manipulation import broadcast_to  #DEFINE_ALIAS
 from .tensor.manipulation import expand_as  #DEFINE_ALIAS
+from .tensor.manipulation import tile  #DEFINE_ALIAS
 from .tensor.manipulation import flatten  #DEFINE_ALIAS
 from .tensor.manipulation import gather  #DEFINE_ALIAS
 from .tensor.manipulation import gather_nd  #DEFINE_ALIAS
@@ -126,6 +132,7 @@ from .tensor.manipulation import unstack  #DEFINE_ALIAS
 from .tensor.manipulation import flip  #DEFINE_ALIAS
 from .tensor.manipulation import unbind  #DEFINE_ALIAS
 from .tensor.manipulation import roll  #DEFINE_ALIAS
+from .tensor.manipulation import chunk  #DEFINE_ALIAS
 from .tensor.math import abs  #DEFINE_ALIAS
 from .tensor.math import acos  #DEFINE_ALIAS
 from .tensor.math import asin  #DEFINE_ALIAS
@@ -137,8 +144,6 @@ from .tensor.math import cumsum  #DEFINE_ALIAS
 from .tensor.math import elementwise_add  #DEFINE_ALIAS
 from .tensor.math import elementwise_div  #DEFINE_ALIAS
 from .tensor.math import elementwise_floordiv  #DEFINE_ALIAS
-from .tensor.math import elementwise_max  #DEFINE_ALIAS
-from .tensor.math import elementwise_min  #DEFINE_ALIAS
 from .tensor.math import elementwise_mod  #DEFINE_ALIAS
 from .tensor.math import elementwise_pow  #DEFINE_ALIAS
 from .tensor.math import elementwise_sub  #DEFINE_ALIAS
@@ -167,9 +172,15 @@ from .tensor.math import sums  #DEFINE_ALIAS
 from .tensor.math import tanh  #DEFINE_ALIAS
 from .tensor.math import elementwise_sum  #DEFINE_ALIAS
 from .tensor.math import max  #DEFINE_ALIAS
+from .tensor.math import maximum  #DEFINE_ALIAS
 from .tensor.math import min  #DEFINE_ALIAS
+from .tensor.math import minimum  #DEFINE_ALIAS
 from .tensor.math import mm  #DEFINE_ALIAS
-from .tensor.math import div  #DEFINE_ALIAS
+from .tensor.math import divide  #DEFINE_ALIAS
+from .tensor.math import floor_divide  #DEFINE_ALIAS
+from .tensor.math import remainder  #DEFINE_ALIAS
+from .tensor.math import mod  #DEFINE_ALIAS
+from .tensor.math import floor_mod  #DEFINE_ALIAS
 from .tensor.math import multiply  #DEFINE_ALIAS
 from .tensor.math import add  #DEFINE_ALIAS
 from .tensor.math import atan  #DEFINE_ALIAS
@@ -179,11 +190,16 @@ from .tensor.math import log1p  #DEFINE_ALIAS
 from .tensor.math import erf  #DEFINE_ALIAS
 from .tensor.math import addcmul  #DEFINE_ALIAS
 from .tensor.math import addmm  #DEFINE_ALIAS
-from .tensor.math import clamp  #DEFINE_ALIAS
+from .tensor.math import clip  #DEFINE_ALIAS
 from .tensor.math import trace  #DEFINE_ALIAS
 from .tensor.math import kron  #DEFINE_ALIAS
-# from .tensor.random import gaussin        #DEFINE_ALIAS
-# from .tensor.random import uniform        #DEFINE_ALIAS
+from .tensor.math import isfinite  #DEFINE_ALIAS
+from .tensor.math import isinf  #DEFINE_ALIAS
+from .tensor.math import isnan  #DEFINE_ALIAS
+from .tensor.math import prod  #DEFINE_ALIAS
+from .tensor.random import standard_normal
+from .tensor.random import normal
+from .tensor.random import uniform  #DEFINE_ALIAS
 from .tensor.random import shuffle  #DEFINE_ALIAS
 from .tensor.random import randn  #DEFINE_ALIAS
 from .tensor.random import rand  #DEFINE_ALIAS
@@ -194,49 +210,61 @@ from .tensor.search import argmin  #DEFINE_ALIAS
 from .tensor.search import argsort  #DEFINE_ALIAS
 from .tensor.search import has_inf  #DEFINE_ALIAS
 from .tensor.search import has_nan  #DEFINE_ALIAS
-# from .tensor.search import masked_select        #DEFINE_ALIAS
+from .tensor.search import masked_select  #DEFINE_ALIAS
 from .tensor.search import topk  #DEFINE_ALIAS
 from .tensor.search import where  #DEFINE_ALIAS
 from .tensor.search import index_select  #DEFINE_ALIAS
 from .tensor.search import nonzero  #DEFINE_ALIAS
 from .tensor.search import sort  #DEFINE_ALIAS
 from .framework.random import manual_seed  #DEFINE_ALIAS
-from .framework import append_backward  #DEFINE_ALIAS
-from .framework import gradients  #DEFINE_ALIAS
-from .framework import Executor  #DEFINE_ALIAS
-from .framework import global_scope  #DEFINE_ALIAS
-from .framework import scope_guard  #DEFINE_ALIAS
-from .framework import BuildStrategy  #DEFINE_ALIAS
-from .framework import CompiledProgram  #DEFINE_ALIAS
-from .framework import default_main_program  #DEFINE_ALIAS
-from .framework import default_startup_program  #DEFINE_ALIAS
+from .framework import Variable  #DEFINE_ALIAS
+from .framework import ParamAttr  #DEFINE_ALIAS
 from .framework import create_global_var  #DEFINE_ALIAS
 from .framework import create_parameter  #DEFINE_ALIAS
-from .framework import Print  #DEFINE_ALIAS
-from .framework import py_func  #DEFINE_ALIAS
-from .framework import ExecutionStrategy  #DEFINE_ALIAS
-from .framework import name_scope  #DEFINE_ALIAS
-from .framework import ParallelExecutor  #DEFINE_ALIAS
-from .framework import ParamAttr  #DEFINE_ALIAS
-from .framework import Program  #DEFINE_ALIAS
-from .framework import program_guard  #DEFINE_ALIAS
-from .framework import Variable  #DEFINE_ALIAS
-from .framework import WeightNormParamAttr  #DEFINE_ALIAS
 from .framework import CPUPlace  #DEFINE_ALIAS
 from .framework import CUDAPlace  #DEFINE_ALIAS
 from .framework import CUDAPinnedPlace  #DEFINE_ALIAS
+
+from .framework import BackwardStrategy  #DEFINE_ALIAS
+from .framework import to_variable  #DEFINE_ALIAS
+from .framework import grad  #DEFINE_ALIAS
+from .framework import no_grad  #DEFINE_ALIAS
+from .framework import save  #DEFINE_ALIAS
+from .framework import load  #DEFINE_ALIAS
+from .framework import prepare_context  #DEFINE_ALIAS
+from .framework import ParallelEnv  #DEFINE_ALIAS
+from .framework import DataParallel  #DEFINE_ALIAS
+
+from .framework import NoamDecay  #DEFINE_ALIAS
+from .framework import PiecewiseDecay  #DEFINE_ALIAS
+from .framework import NaturalExpDecay  #DEFINE_ALIAS
+from .framework import ExponentialDecay  #DEFINE_ALIAS
+from .framework import InverseTimeDecay  #DEFINE_ALIAS
+from .framework import PolynomialDecay  #DEFINE_ALIAS
+from .framework import CosineDecay  #DEFINE_ALIAS
+from .framework import set_default_dtype  #DEFINE_ALIAS
+from .framework import get_default_dtype  #DEFINE_ALIAS
+
 from .tensor.search import index_sample  #DEFINE_ALIAS
 from .tensor.stat import mean  #DEFINE_ALIAS
 from .tensor.stat import reduce_mean  #DEFINE_ALIAS
 from .tensor.stat import std  #DEFINE_ALIAS
 from .tensor.stat import var  #DEFINE_ALIAS
 from .fluid.data import data
+from .tensor.stat import numel  #DEFINE_ALIAS
+from .device import get_cudnn_version
+from .device import set_device
+from .device import get_device
 # from .tensor.tensor import Tensor        #DEFINE_ALIAS
 # from .tensor.tensor import LoDTensor        #DEFINE_ALIAS
 # from .tensor.tensor import LoDTensorArray        #DEFINE_ALIAS
 
 from . import incubate
 from .incubate import hapi
-from .fluid.dygraph.base import enable_dygraph as enable_imperative  #DEFINE_ALIAS
-from .fluid.dygraph.base import disable_dygraph as disable_imperative  #DEFINE_ALIAS
-from .fluid.framework import in_dygraph_mode as in_imperative_mode  #DEFINE_ALIAS
+from .fluid.dygraph.base import enable_dygraph as disable_static  #DEFINE_ALIAS
+from .fluid.dygraph.base import disable_dygraph as enable_static  #DEFINE_ALIAS
+from .fluid.framework import in_dygraph_mode as in_dynamic_mode  #DEFINE_ALIAS
+from .fluid.dygraph.base import no_grad  #DEFINE_ALIAS
+
+from . import jit
+from . import static
diff --git a/python/paddle/declarative/__init__.py b/python/paddle/declarative/__init__.py
deleted file mode 100644
index 0f28cc7f424d5f77f9080dae89f1ec5fa6adb760..0000000000000000000000000000000000000000
--- a/python/paddle/declarative/__init__.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    'fc',
-    'batch_norm',
-    'embedding',
-    'bilinear_tensor_product'
-    'conv2d'
-    'conv2d_transpose'
-    'conv3d'
-    'conv3d_transpose'
-    'create_parameter'
-    'crf_decoding'
-    'data_norm'
-    'deformable_conv'
-    'group_norm'
-    'hsigmoid'
-    'instance_norm'
-    'layer_norm'
-    'multi_box_head'
-    'nce'
-    'prelu'
-    'row_conv'
-    'spectral_norm',
-]
-
-from ..fluid.layers import fc, batch_norm, bilinear_tensor_product, \
-        conv2d, conv2d_transpose, conv3d, conv3d_transpose, create_parameter, \
-        crf_decoding, data_norm, deformable_conv, group_norm, hsigmoid, instance_norm, \
-        layer_norm, multi_box_head, nce, prelu, row_conv, spectral_norm
-
-from ..fluid.input import embedding
diff --git a/python/paddle/device.py b/python/paddle/device.py
index 894ee5b9e8b1debb2c043de30314e8ebb94d3bc0..e2ef8e7092ad3f6af91c8d5d3c0b1deaed025514 100644
--- a/python/paddle/device.py
+++ b/python/paddle/device.py
@@ -13,10 +13,119 @@
 # limitations under the License.
 
 # TODO: define the functions to manipulate devices 
-# __all__ = ['cpu_places',
-#            'CPUPlace',
-#            'cuda_pinned_places',
-#            'cuda_places',
-#            'CUDAPinnedPlace',
-#            'CUDAPlace',
-#            'is_compiled_with_cuda']
+from paddle.fluid import core
+from paddle.fluid import framework
+import re
+
+__all__ = [
+    'get_cudnn_version',
+    'set_device',
+    'get_device'
+    #            'cpu_places',
+    #            'CPUPlace',
+    #            'cuda_pinned_places',
+    #            'cuda_places',
+    #            'CUDAPinnedPlace',
+    #            'CUDAPlace',
+    #            'is_compiled_with_cuda'
+]
+
+_cudnn_version = None
+
+
+def get_cudnn_version():
+    """
+    This funciton return the version of cudnn. the retuen value is int which represents the 
+    cudnn version. For example, if it return 7600, it represents the version of cudnn is 7.6.
+    
+    Returns:
+        int: A int value which represents the cudnn version. If cudnn version is not installed, it return None.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+
+            cudnn_version = get_cudnn_version()
+
+
+
+    """
+    global _cudnn_version
+    if not core.is_compiled_with_cuda():
+        return None
+    if _cudnn_version is None:
+        cudnn_version = int(core.cudnn_version())
+        _cudnn_version = cudnn_version
+        if _cudnn_version < 0:
+            return None
+        else:
+            return cudnn_version
+    else:
+        return _cudnn_version
+
+
+def set_device(device):
+    """
+    Paddle supports running calculations on various types of devices, including CPU and GPU.
+    They are represented by string identifiers. This function can specify the global device
+    which the OP will run.
+
+    Parameters:
+        device(str): This parameter determines the specific running device.
+            It can be ``cpu`` or ``gpu:0``. When ``device`` is ``cpu``, the
+            program is running on the cpu. When ``device`` is ``gpu``, the
+            program is running ont the gpu.
+    Examples:
+
+     .. code-block:: python
+            
+        import paddle
+        paddle.enable_imperative()
+        paddle.fluid.dygraph.set_device("gpu:0")
+        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
+        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
+        data = paddle.stack([x1,x2], axis=1)
+    """
+    lower_device = device.lower()
+    if lower_device == 'cpu':
+        place = core.CPUPlace()
+        framework._set_expected_place(place)
+    else:
+        avaliable_device = ((lower_device == 'cpu') or
+                            re.match(r'gpu:\d+', lower_device))
+        if not avaliable_device:
+            raise ValueError(
+                "The device must be a string which is like 'cpu' or 'gpu:0'")
+        device_info_list = device.split(':', 1)
+        device_id = device_info_list[1]
+        device_id = int(device_id)
+        place = core.CUDAPlace(device_id)
+        framework._set_expected_place(place)
+
+
+def get_device():
+    """
+    This funciton can get the current global device of the program is running.
+    It's a string which is like 'cpu' and 'gpu:0'. if the global device is not
+    set, it will return a string which is 'gpu:0' when cuda is avaliable or it 
+    will return a string which is 'cpu' when cuda is not avaliable.
+
+    Examples:
+
+     .. code-block:: python
+            
+        import paddle
+        paddle.enable_imperative()
+        device = paddle.fluid.dygraph.get_device()
+
+    """
+    device = ''
+    place = framework._current_expected_place()
+    if isinstance(place, core.CPUPlace):
+        device = 'cpu'
+    elif isinstance(place, core.CUDAPlace):
+        device_id = place.get_device_id()
+        device = 'gpu:' + str(device_id)
+
+    return device
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index d0c32e26092f6ea25771279418582a24ea449ab2..34dd605f901b4357682dc514d59d110db74f9d5b 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .collective import *
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
new file mode 100644
index 0000000000000000000000000000000000000000..c40ae7179395a2fc16ece0d68546221ce53c2180
--- /dev/null
+++ b/python/paddle/distributed/collective.py
@@ -0,0 +1,447 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+from ..fluid.layer_helper import LayerHelper
+from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_
+from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from ..fluid.layers.tensor import fill_constant
+from ..fluid.layers import utils
+from ..fluid.dygraph.parallel import prepare_context
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+__all__ = [
+    'broadcast',
+    'all_reduce',
+    'reduce',
+    'all_gather',
+    'scatter',
+    'barrier',
+    'ReduceOp',
+]
+
+
+class ReduceOp:
+    """Reduce Operation"""
+    SUM = 0
+    MAX = 1
+    MIN = 2
+    PROD = 3
+
+
+class _Group():
+    """The abstract representation of group."""
+
+    def __init__(self, rank, rank_num):
+        self.rank = rank
+        self.nranks = rank_num
+
+
+_default_group = _Group(
+    int(os.getenv("PADDLE_TRAINER_ID", "0")),
+    int(os.getenv("PADDLE_TRAINERS_NUM", "1")))
+
+
+def broadcast(tensor, src, group=0):
+    """
+
+    Broadcast a tensor from the source to all others.
+
+    Args:
+        tensor (Tensor): The Tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type
+            should be float16, float32, float64, int32 or int64.
+        src (int): The source rank.
+        group (int): The process group to work on. It is Optional.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data = np.array([[4, 5, 6], [4, 5, 6]])
+        else:
+            np_data = np.array([[1, 2, 3], [1, 2, 3]])
+        data = paddle.to_tensor(np_data)
+        paddle.distributed.broadcast(data, 1)
+        out = data.numpy()
+        # [[1, 2, 3], [1, 2, 3]]
+    """
+    if in_dygraph_mode():
+        return core.ops.c_broadcast(tensor, tensor, 'root', src,
+                                    'use_calc_stream', True, 'ring_id', group)
+
+    op_type = 'c_broadcast'
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'broadcast')
+    if not isinstance(src, int) or not isinstance(group, int):
+        raise ValueError("Both the type of 'src' and 'group' for broadcast "
+                         "should be int.")
+
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'root': src,
+            'use_calc_stream': True,
+            'ring_id': group,
+        })
+
+
+def all_reduce(tensor, op=ReduceOp.SUM, group=0):
+    """
+
+    Reduce a tensor over all ranks so that all get the result.
+
+    Args:
+        tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type
+            should be float16, float32, float64, int32 or int64.
+        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used.
+        group (int): Optional. The process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        from paddle.distributed import ReduceOp
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data = np.array([[4, 5, 6], [4, 5, 6]])
+        else:
+            np_data = np.array([[1, 2, 3], [1, 2, 3]])
+        data = paddle.to_tensor(np_data)
+        paddle.distributed.all_reduce(data)
+        out = data.numpy()
+        # [[5, 7, 9], [5, 7, 9]]
+    """
+    if in_dygraph_mode():
+        if op == ReduceOp.SUM:
+            return core.ops.c_allreduce_sum(tensor, tensor, 'use_calc_stream',
+                                            True, 'ring_id', group)
+        elif op == ReduceOp.MAX:
+            return core.ops.c_allreduce_max(tensor, tensor, 'use_calc_stream',
+                                            True, 'ring_id', group)
+        elif op == ReduceOp.MIN:
+            return core.ops.c_allreduce_min(tensor, tensor, 'use_calc_stream',
+                                            True, 'ring_id', group)
+        elif op == ReduceOp.PROD:
+            return core.ops.c_allreduce_prod(tensor, tensor, 'use_calc_stream',
+                                             True, 'ring_id', group)
+        else:
+            raise ValueError("Unknown parameter: {}.".format(op))
+
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'all_reduce')
+    if not op in [ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN, ReduceOp.PROD]:
+        raise ValueError("The op for all_reduce must be one of educeOp.PROD, "
+                         "ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN.")
+    if op == ReduceOp.SUM:
+        op_type = 'c_allreduce_sum'
+    elif op == ReduceOp.MAX:
+        op_type = 'c_allreduce_max'
+    elif op == ReduceOp.MIN:
+        op_type = 'c_allreduce_min'
+    elif op == ReduceOp.PROD:
+        op_type = 'c_allreduce_prod'
+    if not isinstance(group, int):
+        raise ValueError("The type of 'group' for all_reduce should be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={'ring_id': group,
+               'use_calc_stream': True})
+
+
+def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
+    """
+
+    Reduce a tensor to the destination from all others.
+
+    Args:
+        tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type
+            should be float16, float32, float64, int32 or int64.
+        dst (int): The destination rank id.
+        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used.
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data = np.array([[4, 5, 6], [4, 5, 6]])
+        else:
+            np_data = np.array([[1, 2, 3], [1, 2, 3]])
+        data = paddle.to_tensor(np_data)
+        paddle.distributed.reduce(data, 0)
+        out = data.numpy()
+        # [[5, 7, 9], [5, 7, 9]]
+    """
+    if in_dygraph_mode():
+        if op == ReduceOp.SUM:
+            return core.ops.c_reduce_sum(tensor, tensor, 'use_calc_stream',
+                                         True, 'ring_id', group, 'root_id', dst)
+        elif op == ReduceOp.MAX:
+            return core.ops.c_reduce_max(tensor, tensor, 'use_calc_stream',
+                                         True, 'ring_id', group, 'root_id', dst)
+        elif op == ReduceOp.MIN:
+            return core.ops.c_reduce_min(tensor, tensor, 'use_calc_stream',
+                                         True, 'ring_id', group, 'root_id', dst)
+        elif op == ReduceOp.PROD:
+            return core.ops.c_reduce_prod(tensor, tensor, 'use_calc_stream',
+                                          True, 'ring_id', group, 'root_id',
+                                          dst)
+        else:
+            raise ValueError("Unknown parameter: {}.".format(op))
+
+    op_type = 'c_reduce'
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'all_reduce')
+    if not op in [ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN, ReduceOp.PROD]:
+        raise ValueError("The op for reduce must be one of educeOp.PROD, "
+                         "ReduceOp.SUM, ReduceOp.MAX, ReduceOp.MIN.")
+
+    if op == ReduceOp.SUM:
+        op_type = 'c_reduce_sum'
+    elif op == ReduceOp.MAX:
+        op_type = 'c_reduce_max'
+    elif op == ReduceOp.MIN:
+        op_type = 'c_reduce_min'
+    elif op == ReduceOp.PROD:
+        op_type = 'c_reduce_prod'
+
+    if not isinstance(dst, int) or not isinstance(group, int):
+        raise ValueError("Both the type of 'dst' and 'group' for reduce "
+                         "should be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': group,
+            'use_calc_stream': True,
+            'root_id': dst,
+        })
+
+
+def all_gather(tensor_list, tensor, group=0):
+    """
+
+    Gather tensors from all participators and all get the result.
+
+    Args:
+        tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type
+            should be float16, float32, float64, int32 or int64.
+        tensor (Tensor): The Tensor to send. Its data type
+            should be float16, float32, float64, int32 or int64.
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        tensor_list = []
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
+            np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
+            data1 = paddle.to_tensor(np_data1)
+            data2 = paddle.to_tensor(np_data2)
+            paddle.distributed.all_gather(tensor_list, data1)
+        else:
+            np_data1 = np.array([[1, 2, 3], [1, 2, 3]])
+            np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
+            data1 = paddle.to_tensor(np_data1)
+            data2 = paddle.to_tensor(np_data2)
+            out = paddle.distributed.all_gather(tensor_list, data2)
+    """
+    op_type = 'c_allgather'
+    helper = LayerHelper(op_type, **locals())
+    out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
+    if in_dygraph_mode():
+        core.ops.c_allgather(tensor, out, 'use_calc_stream', True, 'ring_id',
+                             group, 'nranks', _default_group.nranks)
+    else:
+        if not isinstance(tensor_list, list):
+            raise ValueError("The type of 'tensor_list' for all_gather "
+                             "should be list.")
+        for elem in tensor_list:
+            check_variable_and_dtype(
+                elem, 'tensor_list',
+                ['float16', 'float32', 'float64', 'int32', 'int64'],
+                'all_gather')
+        check_variable_and_dtype(
+            tensor, 'tensor',
+            ['float16', 'float32', 'float64', 'int32', 'int64'], 'all_gather')
+        if not isinstance(group, int):
+            raise ValueError("The type of 'group' for all_gather "
+                             "should be int.")
+        helper.append_op(
+            type=op_type,
+            inputs={'X': [tensor]},
+            outputs={'Out': [out]},
+            attrs={
+                'ring_id': group,
+                'use_calc_stream': True,
+                'nranks': _default_group.nranks
+            })
+
+    tensor_list.extend(paddle.split(out, _default_group.nranks, 0))
+
+
+def scatter(tensor, tensor_list=None, src=0, group=0):
+    """
+
+    Scatter a tensor to all participators.
+
+    Args:
+        tensor (Tensor): The output Tensor. Its data type
+            should be float16, float32, float64, int32 or int64.
+        tensor_list (list): A list of Tensors to scatter. Every element in the list must be a Tensor whose data type
+            should be float16, float32, float64, int32 or int64.
+        src (int): The source rank id.
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        if paddle.ParallelEnv().local_rank == 0:
+            np_data1 = np.array([7, 8, 9])
+            np_data2 = np.array([10, 11, 12])
+        else:
+            np_data1 = np.array([1, 2, 3])
+            np_data2 = np.array([4, 5, 6])
+        data1 = paddle.to_tensor(np_data1)
+        data2 = paddle.to_tensor(np_data2)
+        if paddle.ParallelEnv().local_rank == 0:
+            paddle.distributed.scatter(data1, src=1)
+        else:
+            paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
+        out = data1.numpy()
+    """
+    op_type = 'c_scatter'
+    global _default_group
+    rank = _default_group.rank
+    nranks = _default_group.nranks
+    if rank != src:
+        tensor_list = []
+        for _ in range(nranks):
+            tensor_list.append(tensor)
+    temp = paddle.concat(tensor_list, axis=0)
+    if in_dygraph_mode():
+        return core.ops.c_scatter(temp, tensor, 'use_calc_stream', True,
+                                  'ring_id', group, 'nranks',
+                                  _default_group.nranks, 'root', src)
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'scatter')
+    if not isinstance(group, int) or not isinstance(src, int):
+        raise ValueError("Both the type of 'src' and 'group' for scatter "
+                         "should be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': group,
+            'root': src,
+            'use_calc_stream': True,
+            'nranks': nranks,
+        })
+
+
+def barrier(group=0):
+    """
+
+    Barrier among all participators in the group.
+
+    Args:
+        group (int): The id of the process group to work on.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import paddle.prepare_context as prepare_context
+
+        paddle.disable_static()
+        paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
+        prepare_context()
+        paddle.distributed.barrier()
+    """
+    op_type = 'barrier'
+    temp = paddle.fill_constant([1], dtype="int32", value="1")
+    if in_dygraph_mode():
+        return core.ops.barrier(temp, temp, 'ring_id', group)
+    if not isinstance(group, int):
+        raise ValueError("The type of 'group' for barrier must be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [temp]},
+        attrs={'ring_id': group})
diff --git a/python/paddle/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
similarity index 75%
rename from python/paddle/fleet/__init__.py
rename to python/paddle/distributed/fleet/__init__.py
index b25c362ce9301c122d2e2b6915e444da6a90ceca..b080fb17553d4a93a545f4ae781d786d82e26576 100644
--- a/python/paddle/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -12,14 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define distributed api under this directory, 
+# TODO: define distributed api under this directory,
+from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker
 from .base.distributed_strategy import DistributedStrategy
 from .base.fleet_base import Fleet
 from .base.util_factory import UtilBase
+from .dataset import *
 
-#from .base.role_maker import PaddleCloudRoleMaker
-
-__all__ = ["DistributedStrategy", "UtilBase"]
+__all__ = [
+    "DistributedStrategy",
+    "UtilBase",
+    "DatasetFactory",
+    "DatasetBase",
+    "InMemoryDataset",
+    "QueueDataset",
+    "UserDefinedRoleMaker",
+    "PaddleCloudRoleMaker",
+]
 
 fleet = Fleet()
 init = fleet.init
@@ -39,4 +48,6 @@ init_server = fleet.init_server
 run_server = fleet.run_server
 stop_worker = fleet.stop_worker
 distributed_optimizer = fleet.distributed_optimizer
+save_inference_model = fleet.save_inference_model
+save_persistables = fleet.save_persistables
 minimize = fleet.minimize
diff --git a/python/paddle/fleet/base/__init__.py b/python/paddle/distributed/fleet/base/__init__.py
similarity index 100%
rename from python/paddle/fleet/base/__init__.py
rename to python/paddle/distributed/fleet/base/__init__.py
diff --git a/python/paddle/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
old mode 100644
new mode 100755
similarity index 64%
rename from python/paddle/fleet/base/distributed_strategy.py
rename to python/paddle/distributed/fleet/base/distributed_strategy.py
index 43e50ca0bee6b324655f7dcfb5e5da2ebc0e85a8..bc6ce8c5e1c3f750318eb105729b88617af5d578
--- a/python/paddle/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import paddle
-from paddle.fleet.proto import distributed_strategy_pb2
-from paddle.fluid.framework import Variable
+from paddle.distributed.fleet.proto import distributed_strategy_pb2
+from paddle.fluid.framework import Variable, set_flags, core
 import google.protobuf.text_format
 
 
@@ -81,6 +81,8 @@ class DistributedJobInfo(object):
 
 
 class DistributedStrategy(object):
+    __lock_attr = False
+
     def __init__(self):
         """
         DistributedStrategy is the main configuration entry for distributed training of Paddle.
@@ -95,6 +97,13 @@ class DistributedStrategy(object):
 
         """
         self.strategy = distributed_strategy_pb2.DistributedStrategy()
+        self.__lock_attr = True
+
+    def __setattr__(self, key, value):
+        if self.__lock_attr and not hasattr(self, key):
+            raise TypeError("%s is not a attribute of %s" %
+                            (key, self.__class__.__name__))
+        object.__setattr__(self, key, value)
 
     def save_to_prototxt(self, output):
         """
@@ -103,7 +112,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.dgc = True
             strategy.recompute = True
@@ -120,7 +129,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.load_from_prototxt("dist_strategy.protoxt")
         """
@@ -141,7 +150,7 @@ class DistributedStrategy(object):
             exe_strategy.num_iteration_per_drop_scope = 10
             exe_strategy.num_iteration_per_run = 10
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.execution_strategy = exe_strategy
         """
         execution_strategy = paddle.fluid.ExecutionStrategy()
@@ -178,7 +187,7 @@ class DistributedStrategy(object):
             build_strategy.fuse_all_optimizer_ops = True
             build_strategy.enable_inplace = True
             
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.build_strategy = build_strategy
         """
 
@@ -211,7 +220,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             role_maker = fleet.PaddleCloudRoleMaker()
             fleet.init(role_maker)
 
@@ -227,8 +236,11 @@ class DistributedStrategy(object):
     def a_sync(self, flag):
         if isinstance(flag, bool):
             self.strategy.a_sync = flag
+            self.a_sync_configs = {"k_steps": 0}
         else:
-            print("WARNING: a_sync should have value of bool type")
+            raise ValueError(
+                "The type of `flag` is invalid, expected type is bool, but received %s".
+                format(type(flag)))
 
     @property
     def a_sync_configs(self):
@@ -250,7 +262,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             role_maker = fleet.PaddleCloudRoleMaker()
             fleet.init(role_maker)
 
@@ -279,7 +291,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.amp = True # by default this is false
 
@@ -295,6 +307,30 @@ class DistributedStrategy(object):
 
     @property
     def amp_configs(self):
+        """
+        Set automatic mixed precision training configurations. In general, amp has serveral configurable
+        settings that can be configured through a dict.
+
+        **Notes**:
+            **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768.
+            **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True.
+            **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000.
+            **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
+            **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0.
+            **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.
+            **custom_white_list(list[str])**: Users' custom white list which always execution fp16.
+            **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "custom_white_list": ['conv2d']}
+        """
         return get_msg_dict(self.strategy.amp_configs)
 
     @amp_configs.setter
@@ -311,7 +347,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
 
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.recompute = True
             # suppose x and y are names of checkpoint tensors for recomputation
@@ -321,6 +357,17 @@ class DistributedStrategy(object):
 
     @property
     def sync_nccl_allreduce(self):
+        """
+        Indicating whether we are using synchronized all reduce in each communication thread
+        We note that system overhead is usually lower when sync_nccl_allreduce = True
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.sync_nccl_allreduce = True
+        """
         return self.strategy.sync_nccl_allreduce
 
     @sync_nccl_allreduce.setter
@@ -332,6 +379,18 @@ class DistributedStrategy(object):
 
     @property
     def use_hierarchical_allreduce(self):
+        """
+        Indicating whether we are using hierarchical allreduce in collective communication
+        Hierarchical allreduce often does allreduce within a certain node group and then do
+        allreduce among the leaders of each group
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.use_hierarchical_allreduce = True
+        """
         return self.strategy.use_hierarchical_allreduce
 
     @use_hierarchical_allreduce.setter
@@ -345,6 +404,17 @@ class DistributedStrategy(object):
 
     @property
     def hierarchical_allreduce_inter_nranks(self):
+        """
+        Number of ranks for low level node groups in hierarchical allreduce
+        Default value: number of GPU cards on each single GPU machine
+
+        Example:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.hierarchical_allreduce_inter_nranks = 8
+        """
         return self.strategy.hierarchical_allreduce_inter_nranks
 
     @hierarchical_allreduce_inter_nranks.setter
@@ -358,6 +428,19 @@ class DistributedStrategy(object):
 
     @property
     def sync_batch_norm(self):
+        """
+        Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
+        
+        Default value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.sync_batch_norm = True
+        """
+
         return self.strategy.sync_batch_norm
 
     @sync_batch_norm.setter
@@ -369,6 +452,17 @@ class DistributedStrategy(object):
 
     @property
     def fuse_all_reduce_ops(self):
+        """
+        Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
+        Default value: True
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.fuse_all_reduce_ops = False
+        """
         return self.strategy.fuse_all_reduce_ops
 
     @fuse_all_reduce_ops.setter
@@ -380,6 +474,18 @@ class DistributedStrategy(object):
 
     @property
     def fuse_grad_size_in_MB(self):
+        """
+        Specifying the size of gradient to fuse in Mega-Bytes
+
+        Default value: 32
+
+        Examples:
+          .. code-block:: python
+        
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.fuse_grad_size_in_MB = 50
+        """
         return self.strategy.fuse_grad_size_in_MB
 
     @fuse_grad_size_in_MB.setter
@@ -404,6 +510,19 @@ class DistributedStrategy(object):
 
     @property
     def nccl_comm_num(self):
+        """
+        Specifying the number of NCCL communicator
+
+        Default value: 1
+
+        Examples:
+          .. code-block:: python
+        
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.nccl_comm_num = 2
+        """
+
         return self.strategy.nccl_comm_num
 
     @nccl_comm_num.setter
@@ -429,7 +548,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.recompute = True
             strategy.recompute_configs = {"checkpionts": ["x", "y"]}
@@ -454,7 +573,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
 
@@ -487,7 +606,7 @@ class DistributedStrategy(object):
         Examples:
           .. code-block:: python
         
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
             strategy.pipeline_configs = {"micro_batch": 12}
@@ -525,6 +644,20 @@ class DistributedStrategy(object):
 
     @property
     def dgc(self):
+        """
+        Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
+        [Deep Gradient Compression](https://arxiv.org/abs/1712.01887).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.dgc = True # by default this is false
+
+        """
         return self.strategy.dgc
 
     @dgc.setter
@@ -536,6 +669,28 @@ class DistributedStrategy(object):
 
     @property
     def dgc_configs(self):
+        """
+        Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable
+        settings that can be configured through a dict.
+
+        **Notes**:
+            **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0.
+            **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1.
+                For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100,
+                it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array
+                ends, it will use 0.999 then and after.
+            **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity).
+                Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important
+                element will be transmitted.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.dgc = True
+            strategy.dgc_configs = {"rampup_begin_step": 1252}
+        """
         return get_msg_dict(self.strategy.dgc_configs)
 
     @dgc_configs.setter
@@ -557,7 +712,7 @@ class DistributedStrategy(object):
 
         Examples:
         .. code-block:: python
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.gradient_merge = True
             strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
@@ -580,7 +735,7 @@ class DistributedStrategy(object):
             avg (bool): whether to average the gradients of each mini-batch,
                 the default value is `True`
         Example:
-            import paddle.fleet as fleet
+            import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.gradient_merge = True
             strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
@@ -595,6 +750,20 @@ class DistributedStrategy(object):
 
     @property
     def lars(self):
+        """
+        Set lars configurations. lars is used to deal with the convergence problems when the global 
+        batch size is larger than 8k.  For more details, please refer to 
+        [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True # by default this is false
+        """
         return self.strategy.lars
 
     @lars.setter
@@ -606,6 +775,29 @@ class DistributedStrategy(object):
 
     @property
     def lars_configs(self):
+        """
+        Set Lars training configurations.
+
+        **Notes**:
+        **lars_coeff (float)**: trust ratio in lars formula.
+        **lars_weight_decay** (float): weight decay coefficient in lars formula.
+        **epsilon (float)**: argument is used to avoid potential devision-by-zero 
+        when compute the local lr; 
+        **exclude_from_weight_decay ([string])**: is a list of name strings of layers which
+        will be exclude from weight decay in lars formula.
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True
+            strategy.lars_configs = {
+                        "lars_coeff": 0.01,
+                        "lars_weight_decay": 0.0005,
+                        "epsilon": 0,
+                        "exclude_from_weight_decay": ['batch_norm', '.b_0']
+                    }
+        """
         return get_msg_dict(self.strategy.lars_configs)
 
     @lars_configs.setter
@@ -615,6 +807,22 @@ class DistributedStrategy(object):
 
     @property
     def lamb(self):
+        """
+        Set lamb configurations. lamb is used to deal with the convergence problems for large 
+        batch size training, specially for attention-related model like BERT. For more details, 
+        please refer to 
+        [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
+
+        Default Value: False
+        
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True # by default this is false
+        """
+
         return self.strategy.lamb
 
     @lamb.setter
@@ -624,6 +832,33 @@ class DistributedStrategy(object):
         else:
             print("WARNING: lamb should have value of bool type")
 
+    @property
+    def lamb_configs(self):
+        """
+        Set Lars training configurations.
+
+        **Notes**:
+        **lamb_weight_decay** (float): weight decay coefficient in lamb formula.
+        **exclude_from_weight_decay ([string])**: is a list of name strings of layers which
+        will be exclude from weight decay in lamb formula.
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                    'lamb_weight_decay': 0.01,
+                    'exclude_from_weight_decay': [],
+                }
+        """
+        return get_msg_dict(self.strategy.lamb_configs)
+
+    @lamb_configs.setter
+    def lamb_configs(self, configs):
+        check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs")
+        assign_configs_value(self.strategy.lamb_configs, configs)
+
     @property
     def elastic(self):
         return self.strategy.elastic
@@ -646,6 +881,68 @@ class DistributedStrategy(object):
         else:
             print("WARNING: auto should have value of bool type")
 
+    @property
+    def cudnn_exhaustive_search(self):
+        return self.strategy.cudnn_exhaustive_search
+
+    @cudnn_exhaustive_search.setter
+    def cudnn_exhaustive_search(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.cudnn_exhaustive_search = flag
+        else:
+            print(
+                "WARNING: cudnn_exhaustive_search should have value of bool type"
+            )
+
+    @property
+    def conv_workspace_size_limit(self):
+        return self.strategy.conv_workspace_size_limit
+
+    @conv_workspace_size_limit.setter
+    def conv_workspace_size_limit(self, value):
+        if isinstance(value, int):
+            self.strategy.conv_workspace_size_limit = value
+        else:
+            print(
+                "WARNING: conv_workspace_size_limit should have value of int type"
+            )
+
+    @property
+    def cudnn_batchnorm_spatial_persistent(self):
+        return self.strategy.cudnn_batchnorm_spatial_persistent
+
+    @cudnn_batchnorm_spatial_persistent.setter
+    def cudnn_batchnorm_spatial_persistent(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.cudnn_batchnorm_spatial_persistent = flag
+        else:
+            print(
+                "WARNING: cudnn_batchnorm_spatial_persistent should have value of bool type"
+            )
+
+    def _enable_env(self):
+        strategy = self.strategy
+        keys = [
+            "FLAGS_cudnn_batchnorm_spatial_persistent",
+            "FLAGS_conv_workspace_size_limit",
+            "FLAGS_cudnn_exhaustive_search",
+            "FLAGS_sync_nccl_allreduce",
+            "FLAGS_fuse_parameter_memory_size",
+            "FLAGS_fuse_parameter_groups_size",
+        ]
+        values = [
+            bool(strategy.cudnn_batchnorm_spatial_persistent),
+            int(strategy.conv_workspace_size_limit),
+            bool(strategy.cudnn_exhaustive_search),
+            bool(strategy.sync_nccl_allreduce),
+            int(strategy.fuse_grad_size_in_MB),
+            int(strategy.fuse_grad_size_in_TFLOPS),
+        ]
+
+        for i, key in enumerate(keys):
+            if core.globals().is_public(key):
+                core.globals()[key] = values[i]
+
     def __repr__(self):
         fields = self.strategy.DESCRIPTOR.fields
         for f in fields:
diff --git a/python/paddle/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
similarity index 77%
rename from python/paddle/fleet/base/fleet_base.py
rename to python/paddle/distributed/fleet/base/fleet_base.py
index 979b878a3df966a3af59cee126b884361f5b6ac7..a6286bcca87fad1afddbd8af1e56dad05dab2578 100644
--- a/python/paddle/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -14,14 +14,32 @@
 
 from __future__ import print_function
 import paddle
+from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase
 from .strategy_compiler import StrategyCompiler
+from .distributed_strategy import DistributedStrategy
 from .meta_optimizer_factory import MetaOptimizerFactory
 from .runtime_factory import RuntimeFactory
 from .util_factory import UtilFactory
+from paddle.fluid.wrapped_decorator import wrap_decorator
 
 __all__ = ['Fleet']
 
 
+def _inited_runtime_handler_(func):
+    def __impl__(*args, **kwargs):
+        cls = args[0]
+
+        if cls._runtime_handle is None:
+            raise ValueError("Fleet can not find suitable runtime handler")
+
+        return func(*args, **kwargs)
+
+    return __impl__
+
+
+inited_runtime_handler = wrap_decorator(_inited_runtime_handler_)
+
+
 class Fleet(object):
     """
     Unified API for distributed training of PaddlePaddle
@@ -34,9 +52,8 @@ class Fleet(object):
     Examples:
         .. code-block:: python
 
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+            import paddle.distributed.fleet as fleet
+            role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             strategy = fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.001)
@@ -58,10 +75,35 @@ class Fleet(object):
     def __init__(self):
         self._runtime_handle = None
         self._util = None
+        self._role_maker = None
+        self._is_collective = False
+
+    def init(self, role_maker=None, is_collective=False):
+        """
+        Initialize role_maker in Fleet.
 
-    def init(self, role_maker):
-        self._role_maker = role_maker
+        This function is responsible for the distributed architecture 
+        what you want to run your code behind,such as Transpiler,
+        Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker 
+        
+        """
+        if isinstance(role_maker, RoleMakerBase):
+            self._role_maker = role_maker
+        elif role_maker == None:
+            if isinstance(is_collective, bool):
+                self._is_collective = is_collective
+                self._role_maker = PaddleCloudRoleMaker(
+                    is_collective=self._is_collective)
+            else:
+                raise ValueError(
+                    "Something wrong occurred, please check whether is_collective is bool value"
+                )
+        else:
+            raise ValueError(
+                "Something wrong occurred, please check whether rolemaker is instance of RoleMakerBase"
+            )
         self.strategy_compiler = StrategyCompiler()
+        return None
 
     def is_first_worker(self):
         """
@@ -182,35 +224,49 @@ class Fleet(object):
         """
         self._role_maker.barrier_worker()
 
+    @inited_runtime_handler
     def init_worker(self):
         """
         init worker
         """
-        assert self._runtime_handle is not None
         self._runtime_handle._init_worker()
 
-    def init_server(self, model_dir=None):
+    @inited_runtime_handler
+    def init_server(self, *args, **kwargs):
         """
         init server
         """
-        assert self._runtime_handle is not None
-        self._runtime_handle._init_server()
+        self._runtime_handle._init_server(*args, **kwargs)
 
+    @inited_runtime_handler
     def run_server(self):
         """
         run server
         """
-        assert self._runtime_handle is not None
         self._runtime_handle._run_server()
 
+    @inited_runtime_handler
     def stop_worker(self):
         """
         stop worker
         """
-        assert self._runtime_handle is not None
         self._runtime_handle._stop_worker()
 
-    def distributed_optimizer(self, optimizer, strategy):
+    def save_inference_model(self,
+                             executor,
+                             dirname,
+                             feeded_var_names,
+                             target_vars,
+                             main_program=None,
+                             export_for_deployment=True):
+        self._runtime_handle._save_inference_model(
+            executor, dirname, feeded_var_names, target_vars, main_program,
+            export_for_deployment)
+
+    def save_persistables(self, executor, dirname, main_program=None):
+        self._runtime_handle._save_persistables(executor, dirname, main_program)
+
+    def distributed_optimizer(self, optimizer, strategy=None):
         """
         distirbuted_optimizer
         Returns:
@@ -218,15 +274,16 @@ class Fleet(object):
 
         Examples:
             .. code-block:: python
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+            import paddle.distributed.fleet as fleet
+            role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             strategy = fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.001)
             optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         """
         self.user_defined_optimizer = optimizer
+        if strategy == None:
+            strategy = DistributedStrategy()
         self.user_defined_strategy = strategy
         self.valid_strategy = None
         return self
@@ -260,8 +317,7 @@ class Fleet(object):
 
         Examples:
             import paddle
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+            import paddle.distributed.fleet as fleet
 
             fc_1 = paddle.layers.fc(input=input_x, size=hid_dim, act='tanh')
             fc_2 = paddlen.layers.fc(input=fc_1, size=hid_dim, act='tanh')
@@ -269,7 +325,7 @@ class Fleet(object):
             cost = paddle.layers.cross_entropy(input=prediction, label=input_y)
             avg_cost = paddle.layers.mean(x=cost)
 
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+            role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             strategy = fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.001)
@@ -286,11 +342,12 @@ class Fleet(object):
         context["loss"] = loss
         if startup_program == None:
             self.origin_startup_program = \
-                paddle.default_startup_program().clone(for_test=False)
-            startup_program = paddle.default_startup_program()
+                paddle.static.default_startup_program().clone(for_test=False)
+            startup_program = paddle.static.default_startup_program()
         else:
             self.origin_startup_program = \
                 startup_program.clone(for_test=False)
+
         context["origin_startup_program"] = startup_program
         context["role_maker"] = self._role_maker
 
@@ -326,15 +383,23 @@ class Fleet(object):
         context["valid_strategy"] = valid_strategy
 
         self.valid_strategy = valid_strategy
+        self.valid_strategy._enable_env()
 
         optimize_ops = []
         params_grads = []
+
         if meta_optimizer:
             optimize_ops, params_grads = meta_optimizer.minimize(
                 loss,
                 startup_program=startup_program,
                 parameter_list=parameter_list,
                 no_grad_set=no_grad_set)
+
+            default_program = paddle.static.default_main_program()
+
+            if id(default_program) != id(loss.block.program):
+                paddle.fluid.framework.switch_main_program(loss.block.program)
+
         else:
             optimize_ops, params_grads = self.user_defined_optimizer.minimize(
                 loss,
@@ -344,6 +409,7 @@ class Fleet(object):
 
         context["program_optimize_ops"] = optimize_ops
         context["program_params_grads"] = params_grads
+
         if graph_optimizer:
             optimize_ops, params_grads = graph_optimizer.minimize(
                 loss,
diff --git a/python/paddle/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
similarity index 61%
rename from python/paddle/fleet/base/meta_optimizer_factory.py
rename to python/paddle/distributed/fleet/base/meta_optimizer_factory.py
index 802f6c4dab7f3a98cc11d9bb1956db5ee33b2746..459070fcc4dbef3711c33b2932e8f1c88647aab5 100755
--- a/python/paddle/fleet/base/meta_optimizer_factory.py
+++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
@@ -12,27 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..meta_optimizers import AMPOptimizer
-from ..meta_optimizers import RecomputeOptimizer
-from ..meta_optimizers import GradientMergeOptimizer
-from ..meta_optimizers import GraphExecutionOptimizer
-from ..meta_optimizers import PipelineOptimizer
-from ..meta_optimizers import LocalSGDOptimizer
-from ..meta_optimizers import LarsOptimizer
-from ..meta_optimizers import DGCOptimizer
-
 __all__ = ["MetaOptimizerFactory"]
 
-meta_optimizer_names = [
-    "AMPOptimizer",
-    "RecomputeOptimizer",
-    "GradientMergeOptimizer",
-    "GraphExecutionOptimizer",
-    "PipelineOptimizer",
-    "LocalSGDOptimizer",
-    "LarsOptimizer",
-    "DGCOptimizer",
-]
+from ..meta_optimizers import *
+
+meta_optimizer_names = list(
+    filter(lambda name: name.endswith("Optimizer"), dir()))
 
 
 class MetaOptimizerFactory(object):
diff --git a/python/paddle/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py
similarity index 100%
rename from python/paddle/fleet/base/private_helper_function.py
rename to python/paddle/distributed/fleet/base/private_helper_function.py
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aeeb4a2896ea1d20390e463937aa07d3edd0204
--- /dev/null
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -0,0 +1,565 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Defination of Role Makers."""
+import os
+import numpy as np
+from multiprocessing import Process, Manager
+import paddle.fluid as fluid
+
+__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
+
+
+class Role:
+    WORKER = 1
+    SERVER = 2
+
+
+class RoleMakerBase(object):
+    """
+    RoleMakerBase is a base class for assigning a role to current process
+    in distributed training.
+    A paddle developer can implement RoleMakerBase to design a role maker
+    for worker or pserver assignment.
+    """
+
+    def __init__(self):
+        self._worker_endpoints = []
+        self._server_endpoints = []
+        self._role_is_generated = False
+        self._role = None
+        self._current_id = -1
+
+        self._node_type = None
+        self._node_type_comm = None
+        self._all_comm = None
+
+    def is_worker(self):
+        """
+        return is_worker() of current process
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def is_server(self):
+        """
+        return is_server() of current process
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def is_first_worker(self):
+        """
+        Check whether the node is the first instance of worker.
+        Returns:
+            bool: True if this is the first node of worker,
+                  False if not.
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def worker_num(self):
+        """
+        Get current total worker number.
+
+        Returns:
+            int: worker number
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def server_num(self):
+        """
+        Get current total server number.
+
+        Returns:
+            int: server number
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def worker_index(self):
+        """
+        Get current worker id.
+
+        Returns:
+            int: node id
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def server_index(self):
+        """
+        Get current server id.
+
+        Returns:
+            int: node id
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def role_id(self):
+        """
+        Get current id.
+
+        Returns:
+            int: node id
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def node_num(self):
+        """
+        Get the training node number
+        Returns:
+            int: node num
+        """
+        raise NotImplementedError("Please implement this method in child class")
+
+    def get_trainer_endpoints(self):
+        """
+        return trainer endpoints
+        """
+        return self._worker_endpoints
+
+    def get_pserver_endpoints(self):
+        """
+        return pserver endpoints
+        """
+        return self._server_endpoints
+
+    def to_string(self):
+        return "role: {}, current_id: {}, worker_endpoints: {}, server_endpoints: {}".format(
+            self._role, self._current_id, self._worker_endpoints,
+            self._server_endpoints)
+
+    def _all_gather(self, comm_world, input):
+        """
+
+        Args:
+            input(int|float): input value
+
+        Returns:
+            return a list of values
+        """
+        print("warning: RoleMakerBase does not have all gather.")
+        return None
+
+    def _all_reduce(self, comm_world, input, mode="sum"):
+        """
+        Args:
+            input(list/numpy.array): array of one dim
+            output(list/numpy.array): array of one dim
+            mode(str): "sum" or "min" or "max"
+        """
+        print("warning: RoleMakerBase does not have all reduce worker.")
+        return None
+
+    def _barrier(self, comm_world):
+        """
+        barrier between trainers if current role is TRAINER
+        """
+        print("warning: RoleMakerBase does not have barrier worker.")
+
+
+class PaddleCloudRoleMaker(RoleMakerBase):
+    def __init__(self, is_collective=False, **kwargs):
+        super(PaddleCloudRoleMaker, self).__init__()
+        self._is_collective = is_collective
+        self._init_gloo = False  #default no init gloo
+        self._kwargs = kwargs
+
+        self._role_is_generated = False
+
+        self._server_endpoints = None
+        self._worker_endpoints = None
+
+        self._node_type_comm = None
+        self._all_comm = None
+
+        if not self._is_collective:
+            self._hdfs_name = kwargs.get("hdfs_name", "")
+            self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
+            self._hdfs_path = kwargs.get("path", "").rstrip("/")
+            self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
+                                                    3600)
+            self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
+                                                   9999999)
+            ip_port = kwargs.get("http_ip_port", "")
+            self._http_ip_port = []
+            self._http_server = None
+            # if ip_port is not empty, it will use http instead of hdfs
+            if ip_port != "":
+                self._http_ip_port = ip_port.split(":")
+                # it's for communication between processes
+                self._manager = Manager()
+                # global dict to store status
+                self._http_server_d = self._manager.dict()
+                # set running status of http server
+                self._http_server_d["running"] = False
+            self._iface = self.__get_default_iface()
+            # this environment variable can be empty
+            self._prefix = os.getenv("SYS_JOB_ID", "")
+
+    def _barrier(self, comm_world):
+        if isinstance(comm_world, fluid.core.Gloo):
+            comm_world.barrier()
+        else:
+            print("warning: must init Gloo before using _barrier() function")
+
+    def _all_gather(self, comm_world, input):
+        if isinstance(comm_world, fluid.core.Gloo):
+            self._barrier(comm_world)
+            output = comm_world.all_gather(input)
+            return output
+        else:
+            print("warning: must init Gloo before using _all_gather() function")
+            return None
+
+    def _all_reduce(self, comm_world, input, mode="sum"):
+        if isinstance(comm_world, fluid.core.Gloo):
+
+            input = np.array(input)
+
+            input_shape = input.shape
+            input_list = input.reshape(-1).tolist()
+
+            self._barrier(comm_world)
+            ans = comm_world.all_reduce(input_list, mode)
+            output = np.array(ans).reshape(input_shape)
+            return output
+        else:
+            print("warning: must init Gloo before using _all_reduce() function")
+            return None
+
+    def is_worker(self):
+        """
+        whether current process is worker
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.WORKER
+
+    def is_server(self):
+        """
+        whether current process is server
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.SERVER
+
+    def is_first_worker(self):
+        """
+        whether current process is worker of rank 0
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.WORKER and self._current_id == 0
+
+    def worker_index(self):
+        """
+        get index of current worker
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._current_id
+
+    def server_index(self):
+        """
+        get index of current server
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._current_id
+
+    def role_id(self):
+        """
+        get index of current node
+        """
+        if self.is_server():
+            return self.server_index()
+        elif self.is_worker():
+            return self.worker_index()
+
+    def worker_num(self):
+        """
+        retrun the current number of worker
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._trainers_num
+
+    def server_num(self):
+        """
+        return the current number of server
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._trainers_num
+
+    def node_num(self):
+        """
+        return the training node number
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._node_num
+
+    def get_trainer_endpoints(self):
+        """
+        get endpoint of all trainers
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._worker_endpoints
+
+    def get_pserver_endpoints(self):
+        """
+        get endpoint of all pservers
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._server_endpoints
+
+    def _get_rank(self):
+        """
+        get current rank in all workers and pservers
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._rank
+
+    def _get_size(self):
+        """
+        get total num of all workers and pservers
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._size
+
+    def _ps_env(self):
+        try:
+            # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
+            # format: string(ip:port), eg. 127.0.0.1:6001
+            self._server_endpoints = os.environ[
+                "PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
+            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
+                                               "").split(",")
+
+            trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
+            training_role = os.environ["TRAINING_ROLE"]
+
+            if training_role not in ["TRAINER", "PSERVER"]:
+                raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
+
+            if training_role == "TRAINER":
+                role = Role.WORKER
+                current_id = int(os.environ["PADDLE_TRAINER_ID"])
+                if len(self._worker_endpoints) > 0:
+                    self._cur_endpoint = self._worker_endpoints[current_id]
+            elif training_role == "PSERVER":
+                role = Role.SERVER
+                port = os.environ["PADDLE_PORT"]
+                ip = os.environ["POD_IP"]
+                self._cur_endpoint = ip + ":" + port
+                current_id = self._server_endpoints.index(self._cur_endpoint)
+            else:
+                raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
+        except ValueError as ve:
+            raise ValueError(
+                "something wrong with PaddleCloud, please check environment")
+
+        self._trainers_num = trainers_num
+        self._role = role
+        self._current_id = current_id
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
+
+    def _collective_env(self):
+        self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
+        assert (self._training_role == "TRAINER")
+        self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
+        self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
+        assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
+        self._worker_endpoints = self._worker_endpoints.split(",")
+        self._trainers_num = len(self._worker_endpoints)
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
+
+    def _init_gloo_env(self):
+        def init_gloo_instance(role="trainer"):
+            role = role.lower()
+            assert role in ["trainer", "pserver", "all"]
+            if role == "trainer":
+                all_list = self._worker_endpoints
+                rank = self._current_id
+            elif role == "pserver":
+                all_list = self._server_endpoints
+                rank = self._current_id
+            else:
+                all_list = self._worker_endpoints + self._server_endpoints
+                rank = all_list.index(self._cur_endpoint)
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(len(all_list))
+            gloo.set_prefix(self._prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            if len(self._http_ip_port) != 0:
+                gloo.set_http_store(self._http_ip_port[0],
+                                    int(self._http_ip_port[1]), role)
+            else:
+                gloo.set_hdfs_store(self._hdfs_path + "/" + role,
+                                    self._hdfs_name, self._hdfs_ugi)
+            gloo.init()
+            return gloo
+
+        # paddlecloud support gloo
+        if self._role == Role.WORKER:
+            if self._current_id == 0 and len(self._http_ip_port) != 0:
+                size_d = {
+                    "trainer": len(self._worker_endpoints),
+                    "pserver": len(self._server_endpoints),
+                    "all":
+                    len(self._worker_endpoints) + len(self._server_endpoints)
+                }
+                # child process for http server
+                self._http_server = Process(
+                    target=self.__start_kv_server,
+                    args=(self._http_server_d, size_d))
+                self._http_server.daemon = True
+                # set running status to True
+                self._http_server_d["running"] = True
+                # start child process
+                self._http_server.start()
+            self._node_type = 1
+            gloo = init_gloo_instance("trainer")
+            self._node_type_comm = gloo
+        else:
+            assert self._role == Role.SERVER
+            self._node_type = 0
+            gloo = init_gloo_instance("pserver")
+            self._node_type_comm = gloo
+
+        all_list = self._worker_endpoints + self._server_endpoints
+        self._rank = all_list.index(self._cur_endpoint)
+        self._size = len(all_list)
+
+        gloo = init_gloo_instance("all")
+        self._all_comm = gloo
+
+        if self._http_server is not None:
+            # set running status to False
+            self._http_server_d["running"] = False
+            # wait until child process exits
+            self._http_server.join()
+
+    def generate_role(self):
+        """
+        generate role for role maker
+        """
+        if not self._role_is_generated:
+            if not self._is_collective:
+                self._ps_env()
+                if "PADDLE_WITH_GLOO" in os.environ:
+                    self._init_gloo = bool(os.environ["PADDLE_WITH_GLOO"])
+                if self._init_gloo:
+                    self._init_gloo_env()
+            else:
+                self._collective_env()
+            self._role_is_generated = True
+
+    def __get_default_iface(self):
+        """
+        get default physical interface
+        """
+        default1 = self.__get_default_iface_from_gateway()
+        default2 = self.__get_default_iface_from_interfaces()
+        return default2 if default1 == "lo" else default1
+
+    def __get_default_iface_from_gateway(self):
+        """
+        get default physical interface
+        """
+        import netifaces
+        gateways = netifaces.gateways()
+        if gateways.get(netifaces.AF_INET) != None:
+            gateway = gateways[netifaces.AF_INET]
+            if len(gateway) > 0 and len(gateway[0]) > 1:
+                return gateway[0][1]
+        return "lo"
+
+    def __get_default_iface_from_interfaces(self):
+        """
+        get default physical interface
+        """
+        import netifaces
+        for intf_name in netifaces.interfaces():
+            addresses = netifaces.ifaddresses(intf_name)
+            if netifaces.AF_INET in addresses:
+                ipv4_addresses = addresses[netifaces.AF_INET]
+                for ipv4_address in ipv4_addresses:
+                    if 'broadcast' in ipv4_address:
+                        return intf_name
+        return "lo"
+
+    def __start_kv_server(self, http_server_d, size_d):
+        from paddle.distributed.fleet.utils import KVServer
+        http_server = KVServer(int(self._http_ip_port[1]), size_d)
+        http_server.start()
+        wait_seconds = 5
+        while http_server_d.get("running",
+                                False) and not http_server.shoud_stop():
+            time.sleep(wait_seconds)
+        http_server.stop()
+
+
+class UserDefinedRoleMaker(PaddleCloudRoleMaker):
+    def __init__(self, is_collective=False, init_gloo=False, **kwargs):
+        super(UserDefinedRoleMaker, self).__init__(
+            is_collective=is_collective, init_gloo=init_gloo, **kwargs)
+
+    def _user_defined_ps_env(self):
+        self._server_endpoints = self._kwargs.get("server_endpoints")
+        self._worker_endpoints = self._kwargs.get("worker_endpoints", [])
+        self._trainers_num = self._kwargs.get("worker_num", 0)
+
+        if self._trainers_num == 0:
+            assert (len(self._worker_endpoints) > 0)
+            self._trainers_num = len(self._worker_endpoints)
+
+        self._role = self._kwargs.get("role")
+        self._current_id = self._kwargs.get("current_id")
+
+        if self._role == Role.WORKER and len(
+                self._worker_endpoints) > self._current_id:
+            self._cur_endpoint = self._worker_endpoints[self._current_id]
+        elif self._role == Role.SERVER:
+            self._cur_endpoint = self._server_endpoints[self._current_id]
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
+
+    def _user_defined_collective_env(self):
+        self._worker_endpoints = self._kwargs.get("worker_endpoints")
+        self._current_id = self._kwargs.get("current_id")
+        self._trainers_num = len(self._worker_endpoints)
+        self._training_role = Role.Worker
+        self._node_num = len(
+            set([x.split(':')[0] for x in self._worker_endpoints]))
+
+    def generate_role(self):
+        """
+        generate role for role maker
+        """
+        if not self._role_is_generated:
+            if not self._is_collective:
+                self._user_defined_ps_env()
+                if self._init_gloo:
+                    self._init_gloo_env()
+            else:
+                self._user_defined_collective_env()
+            self._role_is_generated = True
diff --git a/python/paddle/fleet/base/runtime_factory.py b/python/paddle/distributed/fleet/base/runtime_factory.py
similarity index 74%
rename from python/paddle/fleet/base/runtime_factory.py
rename to python/paddle/distributed/fleet/base/runtime_factory.py
index 45dca6dae4e065ba6f2a9f09ac8cf298222b2d15..68d327c2280d01507db8798a80ed19c4eb3a0f4c 100644
--- a/python/paddle/fleet/base/runtime_factory.py
+++ b/python/paddle/distributed/fleet/base/runtime_factory.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ..runtime.collective_runtime import CollectiveRuntime
+from ..runtime.parameter_server_runtime import ParameterServerRuntime
 
 
 class RuntimeFactory(object):
@@ -23,3 +24,9 @@ class RuntimeFactory(object):
             collective_runtime = CollectiveRuntime()
             collective_runtime._set_basic_info(context)
             return collective_runtime
+
+        k_steps = context["valid_strategy"].a_sync_configs["k_steps"]
+        if not context["role_maker"]._is_collective and k_steps >= 0:
+            ps_runtime = ParameterServerRuntime()
+            ps_runtime._set_basic_info(context)
+            return ps_runtime
diff --git a/python/paddle/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
similarity index 75%
rename from python/paddle/fleet/base/strategy_compiler.py
rename to python/paddle/distributed/fleet/base/strategy_compiler.py
index f0e23713e4f3f98217280f2cbe071bf1e23c823e..4097fc1237f8d7616101810f994c243dffb2cd67 100644
--- a/python/paddle/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -76,6 +76,18 @@ class StrategyCompiler(StrategyCompilerBase):
             opt._disable_strategy(valid_strategy)
         return valid_strategy
 
+    """
+    Meta Optimizer Type A: rewrite forward, backward. e.g. recompute, async, sync, pipeline.
+                           results will be splitted in async, sync, pipeline
+    Meta Optimizer Type B: rewrite forward, 
+                           e.g. AMP and the corresponding backward is generated by rewritten forward
+    Meta Opitmizer Type B: rewrite backward. e.g. gradient fusion
+    Meta Optimizer Type D: rewrite optimize. e.g. lars, lamb, localsgd, gradient merge, dgc
+    Meta Optimizer Type E: only transpile to Graph structure for runtime,
+                           currently, grad fusion and kernel fusion, sync batch-norm included.
+                           we will remove grad fusion and sync batch-norm
+    """
+
     def generate_optimizer(self, loss, role_maker, optimizer,
                            user_defined_strategy, meta_optimizer_list,
                            graph_optimizer_list):
@@ -102,4 +114,18 @@ class StrategyCompiler(StrategyCompilerBase):
                 0]
             return_graph = None if graph_optimizers == None else graph_optimizers[
                 0]
+
+            if meta_optimizers == None or graph_optimizers == None:
+                return return_meta, return_graph
+
+            # do heuristic filter here, if any meta optimizer in graph optimizers is in 
+            # any meta optimizers' black list, set return_graph to None
+            need_graph_opt = True
+            for graph_opt in graph_optimizers:
+                for program_opt in meta_optimizers:
+                    if graph_opt.__class__.__name__ in program_opt.meta_optimizers_black_list:
+                        need_graph_opt = False
+            if not need_graph_opt:
+                return_graph = None
+
             return return_meta, return_graph
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5a6c417c0c45bea819c5832f98b5b6c9fabbd4b
--- /dev/null
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -0,0 +1,442 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fleet Utils."""
+"""distributed operations"""
+"""basic collective operations in python"""
+"""remote file system"""
+
+__all__ = ['UtilBase']
+
+import numpy as np
+import os
+
+import subprocess
+from paddle.fluid import core
+from collections import OrderedDict
+import paddle.fluid as fluid
+from google.protobuf import text_format
+from paddle.fluid import debugger
+from paddle.fluid.framework import Program
+from paddle.fluid.proto import framework_pb2
+from ..utils.fs import FS, LocalFS, HDFSClient
+
+
+class UtilFactory(object):
+    def _create_util(self, context=None):
+        util = UtilBase()
+        if context is not None and "valid_strategy" in context:
+            util._set_strategy(context["valid_strategy"])
+        if context is not None and "role_maker" in context:
+            util._set_role_maker(context["role_maker"])
+        return util
+
+
+class UtilBase(object):
+    def __init__(self):
+        self.role_maker = None
+        self.dist_strategy = None
+
+    def _set_strategy(self, dist_strategy):
+        self.dist_strategy = dist_strategy
+
+    def _set_role_maker(self, role_maker):
+        self.role_maker = role_maker
+
+    def set_file_system(self, fs_client):
+        assert isinstance(
+            fs_client, FS
+        ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
+        self.fs_client = fs_client
+
+    def __check_comm_world(self, comm_world="worker"):
+        if not self.role_maker._role_is_generated:
+            self.role_maker.generate_role()
+
+        _comm_world = None
+        comm_world_upper = comm_world.upper()
+        if comm_world_upper == "WORKER":
+            if not self.role_maker.is_worker():
+                print(
+                    "warning: current role is not worker in collective_func(comm_world=\"worker\")"
+                )
+            _comm_world = self.role_maker._node_type_comm
+        elif comm_world_upper == "SERVER":
+            if not self.role_maker.is_server():
+                print(
+                    "warning: current role is not server in collective_func(comm_world=\"server\")"
+                )
+            _comm_world = self.role_maker._node_type_comm
+        elif comm_world_upper == "ALL":
+            _comm_world = self.role_maker._all_comm
+        else:
+            raise ValueError(
+                "not support comm_world, please choose one from [worker, server, all]"
+            )
+
+        return _comm_world
+
+    def all_reduce(self, input, mode, comm_world="worker"):
+        _comm_world = self.__check_comm_world(comm_world)
+        return self.role_maker._all_reduce(_comm_world, input, mode)
+
+    def barrier(self, comm_world="worker"):
+        _comm_world = self.__check_comm_world(comm_world)
+        self.role_maker._barrier(_comm_world)
+
+    def all_gather(self, input, comm_world="worker"):
+        _comm_world = self.__check_comm_world(comm_world)
+        return self.role_maker._all_gather(_comm_world, input)
+
+    def broadcast(self):
+        pass
+
+    def scatter(self):
+        pass
+
+    def get_file_shard(self, files):
+        """
+        split files before distributed training,
+        example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
+                   0 gets [a, b, c] and trainer 1 gets [d, e].
+        example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
+                   [a], trainer 1 gets [b],  trainer 2 gets []
+
+        Args:
+            files(list): file list need to be read.
+
+        Returns:
+            list: files belongs to this worker.
+        """
+        if not isinstance(files, list):
+            raise TypeError("files should be a list of file need to be read.")
+
+        trainer_id = self.role_maker.worker_index()
+        trainers = self.role_maker.worker_num()
+
+        remainder = len(files) % trainers
+        blocksize = int(len(files) / trainers)
+
+        blocks = [blocksize] * trainers
+        for i in range(remainder):
+            blocks[i] += 1
+
+        trainer_files = [[]] * trainers
+        begin = 0
+        for i in range(trainers):
+            trainer_files[i] = files[begin:begin + blocks[i]]
+            begin += blocks[i]
+
+        return trainer_files[trainer_id]
+
+    def print_on_rank(self, message, rank_id):
+        if self.role_maker.worker_index() != rank_id:
+            return
+        print(message)
+
+    def _save_program(self, program, model_filename='__model__', is_text=False):
+        if is_text:
+            with open(model_filename, "w") as f:
+                f.write(str(program))
+        else:
+            with open(model_filename, "wb") as f:
+                f.write(program.desc.serialize_to_string())
+
+    def _load_program(self, path, is_text):
+        def load_program_binary(path):
+            """load program from binary string file"""
+            with open(path, "rb") as f:
+                program_desc_str = f.read()
+            return Program.parse_from_string(program_desc_str)
+
+        def load_program_text(path):
+            """load program from human-readable text file"""
+            with open(path, "r") as f:
+                program_desc_text = f.read()
+
+            prog_desc = framework_pb2.ProgramDesc()
+            text_format.Merge(program_desc_text, prog_desc)
+            return Program.parse_from_string(prog_desc.SerializeToString())
+
+        if is_text:
+            return load_program_text(path)
+        else:
+            return load_program_binary(path)
+
+    def _program_type_trans(self, prog_dir, prog_fn, is_text):
+        prog = self._load_program(os.path.join(prog_dir, prog_fn), is_text)
+        prog_out_fn = prog_fn + ".bin" if is_text else prog_fn + ".pbtxt"
+        self._save_program(prog,
+                           os.path.join(prog_dir, prog_out_fn), 1 - is_text)
+        return prog_out_fn
+
+    def _visualize_graphviz(self, program, output_dir, output_filename):
+        block = program.global_block()
+        dot_path = os.path.join(output_dir, output_filename + '.dot')
+        pdf_path = os.path.join(output_dir, output_filename + '.pdf')
+        debugger.draw_block_graphviz(block, path=dot_path)
+        cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path]
+        p = subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        p.wait()
+
+    def _proto_check(self, config):
+        train_prog = self._load_program(config.train_prog_path,
+                                        config.is_text_train_program)
+        pruned_prog = self._load_program(config.pruned_prog_path,
+                                         config.is_text_pruned_program)
+
+        is_match = True
+
+        pruned_vars = [(v.name, v) for v in pruned_prog.list_vars()
+                       if fluid.io.is_persistable(v)]
+        pruned_vars = OrderedDict(pruned_vars)
+        pruned_vars_name = [name for name in pruned_vars]
+        print("persistable vars in pruned program: {}".format(pruned_vars_name))
+
+        # feed and fetch op is added in pruned program when pruning, not need to be found in train program
+        feed_fetch_type_list = [
+            core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST
+        ]
+
+        for var_name in pruned_vars:
+            var = pruned_vars[var_name]
+            # feed and fetch op is added in pruned program when pruning, not need to be found in train program
+            if var.type in feed_fetch_type_list:
+                break
+            try:
+                train_prog_var = train_prog.global_block().var(var_name)
+            except ValueError as e:
+                print(
+                    "Not find variable '%s' in train program. please check pruning."
+                    % var_name)
+                is_match = False
+                continue
+            if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype:
+                print(
+                    "variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}".
+                    format(var_name, var.shape, var.dtype, train_prog_var.shape,
+                           train_prog_var.dtype))
+                is_match = False
+        return is_match
+
+    def _params_check(self, config):
+        def feed_gen(batch_size, feeded_vars_dims, feeded_vars_filelist):
+            def reader(batch_size, fn, dim):
+                data = []
+                if isinstance(dim, list) or isinstance(dim, tuple):
+                    shape = list(dim)
+                    _temp = 1
+                    for x in dim:
+                        _temp = _temp * x
+                    dim = _temp
+                else:
+                    shape = [dim]
+
+                shape = [batch_size] + shape
+                dim = dim * batch_size
+
+                for line in open(fn, 'r'):
+                    fields = line.strip().split(' ')
+                    fields = [float(d) for d in fields]
+                    while len(fields) >= dim:
+                        tmp = fields[:dim]
+                        fields = fields[dim:]
+                        data.append(np.array(tmp).reshape(shape))
+                return data
+
+            batch_feed = []
+            for i, fn in enumerate(feeded_vars_filelist):
+                batch_feed.append(reader(batch_size, fn, feeded_vars_dims[i]))
+            return batch_feed
+
+        prog = self._load_program(
+            os.path.join(config.dump_model_dir, config.dump_program_filename),
+            config.is_text_dump_program)
+        if config.is_text_dump_program:
+            model_filename = self._program_type_trans(
+                config.dump_model_dir, config.dump_program_filename,
+                config.is_text_dump_program)
+
+        saved_params = [
+            v for v in prog.list_vars() if fluid.io.is_persistable(v)
+        ]
+        print("persistable vars in dump program: {}".format(
+            [v.name for v in saved_params]))
+
+        def check_not_expected_ops(prog, not_expected_op_types):
+            op_types_set = set()
+            for op in prog.global_block().ops:
+                if op.type in not_expected_op_types and op.type not in op_types_set:
+                    op_types_set.add(op.type)
+            return op_types_set
+
+        not_expected_op_types = check_not_expected_ops(prog, ["lookup_table"])
+        if len(not_expected_op_types) > 0:
+            print(
+                "find op type '{}' in program, please check if your program is pruned correctly !".
+                format(list(not_expected_op_types)))
+            return False
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            inference_program, feed_target_names, fetch_targets = \
+                fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
+                                            params_filename=config.save_params_filename)
+
+            # check program vars and saved vars shape
+            orig_para_shape = {
+                each_var.name: tuple(each_var.desc.shape())
+                for each_var in saved_params
+            }
+            for each_var in saved_params:
+                var_temp = fluid.global_scope().find_var(each_var.name)
+                assert var_temp != None, "can't not find var: " + each_var.name
+                new_shape = (np.array(var_temp.get_tensor())).shape
+                assert each_var.name in orig_para_shape, each_var.name + "MUST in var list"
+                orig_shape = orig_para_shape.get(each_var.name)
+                if new_shape != orig_shape:
+                    raise RuntimeError(
+                        "Shape not matching: the Program requires a parameter with a shape of ({}), "
+                        "while the loaded parameter (namely [ {} ]) has a shape of  ({}).".
+                        format(orig_shape, each_var.name, new_shape))
+
+            # check feed/fetch vars in program and config
+            feed_config = config.feed_config
+            fetch_config = config.fetch_config
+            fetch_targets_names = [v.name for v in fetch_targets]
+            if not feed_target_names:
+                print("warning! no feed targets in program.")
+            if not fetch_targets_names:
+                print("warning! no fetch targets in program.")
+            fetch_list = fetch_targets
+            feed_name_list = feed_target_names
+            if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names:
+                print(
+                    "warning! feed vars in program and config are diff: feed in program: {}. feed in config {}.".
+                    format(feed_target_names, feed_config.feeded_vars_names))
+                feed_name_list = feed_config.feeded_vars_names
+                # remove feed op in inference_program. new feed op will be added in exe.run
+                global_block = inference_program.global_block()
+                need_to_remove_op_index = []
+                for i, op in enumerate(global_block.ops):
+                    op.desc.set_is_target(False)
+                    if op.type == "feed":  # only remove feed op here
+                        need_to_remove_op_index.append(i)
+                for index in need_to_remove_op_index[::-1]:
+                    global_block._remove_op(index)
+            if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names:
+                print(
+                    "warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.".
+                    format(fetch_targets_names, fetch_config.fetch_vars_names))
+                fetch_list = [
+                    inference_program.global_block().var(i)
+                    for i in fetch_config.fetch_vars_names
+                ]
+                # remove fetch op in inference_program. new fetch op will be added in exe.run
+                global_block = inference_program.global_block()
+                need_to_remove_op_index = []
+                for i, op in enumerate(global_block.ops):
+                    op.desc.set_is_target(False)
+                    if op.type == "fetch":  # only remove fetch op here
+                        need_to_remove_op_index.append(i)
+                for index in need_to_remove_op_index[::-1]:
+                    global_block._remove_op(index)
+
+            # if fetch_list have lod tensor
+            return_numpy = all([v.lod_level == 0 for v in fetch_list])
+
+            # try dump fetch_targets
+            feed_tensors = []
+            assert len(feed_config.feeded_vars_names) == len(
+                feed_config.feeded_vars_dims) == len(
+                    feed_config.feeded_vars_types)
+            # check program vars and feed tensor shape in config
+            for i in range(len(feed_config.feeded_vars_names)):
+                var = inference_program.global_block().var(
+                    feed_config.feeded_vars_names[i])
+                if not isinstance(feed_config.feeded_vars_dims[i],
+                                  (list, tuple)):
+                    tensor_shape = (feed_config.feeded_vars_dims[i], )
+                else:
+                    tensor_shape = tuple(feed_config.feeded_vars_dims[i])
+                feed_config.feeded_vars_dims[i] = tensor_shape
+                var_shape = var.shape[1:]
+                if tensor_shape != var_shape:
+                    raise RuntimeError(
+                        "feed variable '{}' shape not match. infer program  shape: {}. feed tensor shape: {}".
+                        format(feed_config.feeded_vars_names[i], var_shape,
+                               tensor_shape))
+
+            if not feed_config.feeded_vars_filelist:
+                print("generate random feed vars.")
+                for i in range(len(feed_config.feeded_vars_names)):
+                    var = inference_program.global_block().var(
+                        feed_config.feeded_vars_names[i])
+                    # create fake feed tensor. if lod_level > 1, should create_lod_tensor()
+                    if var.lod_level == 0:
+                        feed_tensors.append(
+                            np.array(
+                                np.random.random(
+                                    tuple([config.batch_size] + list(
+                                        feed_config.feeded_vars_dims[i]))),
+                                dtype=feed_config.feeded_vars_types[i]))
+                    elif var.lod_level == 1:
+                        t = np.array(
+                            np.random.random(
+                                tuple([config.batch_size] + list(
+                                    feed_config.feeded_vars_dims[i]))),
+                            dtype=feed_config.feeded_vars_types[i])
+                        feed_tensors.append(
+                            fluid.create_lod_tensor(t, [[1] * config.batch_size
+                                                        ], place))
+                    else:
+                        raise RuntimeError(
+                            "vars with lod_level >= 2 is not supported now in this infer program check tool."
+                        )
+                results = exe.run(inference_program,
+                                  feed={
+                                      name: feed_tensors[i]
+                                      for i, name in enumerate(feed_name_list)
+                                  },
+                                  fetch_list=fetch_list,
+                                  return_numpy=return_numpy)
+            else:
+                print("load feed vars from files: {}.".format(
+                    feed_config.feeded_vars_filelist))
+                feed_vars = [
+                    inference_program.global_block().var(
+                        feed_config.feeded_vars_names[i])
+                    for i in range(len(feed_config.feeded_vars_names))
+                ]
+                feeder = fluid.DataFeeder(feed_list=feed_vars, place=place)
+                batch_feed = feed_gen(config.batch_size,
+                                      feed_config.feeded_vars_dims,
+                                      feed_config.feeded_vars_filelist)
+                slots = [batch_feed]
+                results = exe.run(inference_program,
+                                  feed=feeder.feed(slots),
+                                  fetch_list=fetch_list,
+                                  return_numpy=return_numpy)
+            for i, v in enumerate(fetch_list):
+                print("fetch_targets name: %s" % v.name)
+                print("fetch_targets: {}".format(results[i]))
+            return results
+
+
+fleet_util = UtilFactory()._create_util(None)
diff --git a/python/paddle/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py
similarity index 97%
rename from python/paddle/fleet/cloud_utils.py
rename to python/paddle/distributed/fleet/cloud_utils.py
index 72c306fe3b91531b6f7f39134bf4abd86c686dee..49d66118d902e43f7ee0c4003c516081092b2a97 100644
--- a/python/paddle/fleet/cloud_utils.py
+++ b/python/paddle/distributed/fleet/cloud_utils.py
@@ -14,7 +14,7 @@
 
 import os
 import paddle
-from paddle.fleet.launch_utils import get_cluster, logger
+from paddle.distributed.fleet.launch_utils import get_cluster, logger
 
 
 def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
diff --git a/python/paddle/fleet/dataset/__init__.py b/python/paddle/distributed/fleet/dataset/__init__.py
similarity index 96%
rename from python/paddle/fleet/dataset/__init__.py
rename to python/paddle/distributed/fleet/dataset/__init__.py
index 8647330f3290f3142cabca9a7e3fe162a9838dda..af33c4eafb396827335157933d51f37ca8b06011 100644
--- a/python/paddle/fleet/dataset/__init__.py
+++ b/python/paddle/distributed/fleet/dataset/__init__.py
@@ -10,3 +10,5 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
+
+from .dataset import *
diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6504cacd9680806a13b4bb815247124b7e6a23c
--- /dev/null
+++ b/python/paddle/distributed/fleet/dataset/dataset.py
@@ -0,0 +1,1103 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is definition of dataset class, which is high performance IO."""
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.proto import data_feed_pb2
+from google.protobuf import text_format
+import paddle.fluid.core as core
+
+
+class DatasetFactory(object):
+    """
+    DatasetFactory is a factory which create dataset by its name,
+    you can create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
+    the default is "QueueDataset".
+
+    Example:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+
+    """
+
+    def __init__(self):
+        """ Init. """
+        pass
+
+    def create_dataset(self, datafeed_class="QueueDataset"):
+        """
+        Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
+        the default is "QueueDataset".
+
+        Args:
+            datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset.
+                                 Default is QueueDataset.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+
+        """
+        try:
+            dataset = globals()[datafeed_class]()
+            return dataset
+        except:
+            raise ValueError("datafeed class %s does not exist" %
+                             datafeed_class)
+
+
+class DatasetBase(object):
+    """ Base dataset class. """
+
+    def __init__(self):
+        """ Init. """
+        # define class name here
+        # to decide whether we need create in memory instance
+        self.proto_desc = data_feed_pb2.DataFeedDesc()
+        self.proto_desc.pipe_command = "cat"
+        self.dataset = core.Dataset("MultiSlotDataset")
+        self.thread_num = 1
+        self.filelist = []
+
+    def set_pipe_command(self, pipe_command):
+        """
+        Set pipe command of current dataset
+        A pipe command is a UNIX pipeline command that can be used only
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_pipe_command("python my_script.py")
+
+        Args:
+            pipe_command(str): pipe command
+
+        """
+        self.proto_desc.pipe_command = pipe_command
+
+    def set_rank_offset(self, rank_offset):
+        """
+        Set rank_offset for merge_pv. It set the message of Pv.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_rank_offset("rank_offset")
+
+        Args:
+            rank_offset(str): rank_offset's name
+
+        """
+        self.proto_desc.rank_offset = rank_offset
+
+    def set_fea_eval(self, record_candidate_size, fea_eval=True):
+        """
+        set fea eval mode for slots shuffle to debug the importance level of
+        slots(features), fea_eval need to be set True for slots shuffle.
+        
+        Args:
+            record_candidate_size(int): size of instances candidate to shuffle 
+                                        one slot
+            fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
+                            default is True.
+            
+        Examples:
+            .. code-block:: python
+
+            import paddle.fluid as fluid
+            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            dataset.set_fea_eval(1000000, True)
+
+        """
+        if fea_eval:
+            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
+        self.fea_eval = fea_eval
+
+    def slots_shuffle(self, slots):
+        """
+        Slots Shuffle 
+        Slots Shuffle is a shuffle method in slots level, which is usually used 
+        in sparse feature with large scale of instances. To compare the metric, i.e.
+        auc while doing slots shuffle on one or several slots with baseline to 
+        evaluate the importance level of slots(features).
+        
+        Args:
+            slots(list[string]): the set of slots(string) to do slots shuffle.
+
+        Examples:
+            import paddle.fluid as fluid
+            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            dataset.set_merge_by_lineid()
+            #suppose there is a slot 0
+            dataset.slots_shuffle(['0'])
+        """
+        if self.fea_eval:
+            slots_set = set(slots)
+            self.dataset.slots_shuffle(slots_set)
+
+    def set_batch_size(self, batch_size):
+        """
+        Set batch size. Will be effective during training
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_batch_size(128)
+
+        Args:
+            batch_size(int): batch size
+
+        """
+        self.proto_desc.batch_size = batch_size
+
+    def set_pv_batch_size(self, pv_batch_size):
+        """
+        Set pv batch size. It will be effective during enable_pv_merge
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_pv_batch(128)
+        Args:
+            pv_batch_size(int): pv batch size
+
+        """
+        self.proto_desc.pv_batch_size = pv_batch_size
+
+    def set_thread(self, thread_num):
+        """
+        Set thread num, it is the num of readers.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+               dataset.set_thread(12)
+
+        Args:
+            thread_num(int): thread num
+        """
+        self.dataset.set_thread_num(thread_num)
+        self.thread_num = thread_num
+
+    def set_filelist(self, filelist):
+        """
+        Set file list in current worker.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_filelist(['a.txt', 'b.txt'])
+
+        Args:
+            filelist(list): file list
+        """
+        self.dataset.set_filelist(filelist)
+        self.filelist = filelist
+
+    def set_input_type(self, input_type):
+        self.proto_desc.input_type = input_type
+
+    def set_use_var(self, var_list):
+        """
+        Set Variables which you will use.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_use_var([data, label])
+
+        Args:
+            var_list(list): variable list
+        """
+        multi_slot = self.proto_desc.multi_slot_desc
+        for var in var_list:
+            slot_var = multi_slot.slots.add()
+            slot_var.is_used = True
+            slot_var.name = var.name
+            if var.lod_level == 0:
+                slot_var.is_dense = True
+                slot_var.shape.extend(var.shape)
+            if var.dtype == core.VarDesc.VarType.FP32:
+                slot_var.type = "float"
+            elif var.dtype == core.VarDesc.VarType.INT64:
+                slot_var.type = "uint64"
+            else:
+                raise ValueError(
+                    "Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
+                )
+
+    def set_hdfs_config(self, fs_name, fs_ugi):
+        """
+        Set hdfs config: fs name ad ugi
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
+
+        Args:
+            fs_name(str): fs name
+            fs_ugi(str): fs ugi
+        """
+        self.dataset.set_hdfs_config(fs_name, fs_ugi)
+
+    def set_download_cmd(self, download_cmd):
+        """
+        Set customized download cmd: download_cmd
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              dataset.set_download_cmd("./read_from_afs")
+
+        Args:
+            download_cmd(str): customized download command
+        """
+        self.dataset.set_download_cmd(download_cmd)
+
+    def _prepare_to_run(self):
+        """
+        Set data_feed_desc before load or shuffle,
+        user no need to call this function.
+        """
+        if self.thread_num > len(self.filelist):
+            self.thread_num = len(self.filelist)
+        self.dataset.set_thread_num(self.thread_num)
+        self.dataset.set_data_feed_desc(self.desc())
+        self.dataset.create_readers()
+
+    def _finish_to_run(self):
+        self.dataset.destroy_readers()
+
+    def desc(self):
+        """
+        Returns a protobuf message for this DataFeedDesc
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset()
+              print(dataset.desc())
+
+        Returns:
+            A string message
+        """
+        return text_format.MessageToString(self.proto_desc)
+
+    def _dynamic_adjust_before_train(self, thread_num):
+        pass
+
+    def _dynamic_adjust_after_train(self):
+        pass
+
+
+class InMemoryDataset(DatasetBase):
+    """
+    InMemoryDataset, it will load data into memory
+    and shuffle data before training.
+    This class should be created by DatasetFactory
+
+    Example:
+        dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset")
+    """
+
+    def __init__(self):
+        """ Init. """
+        super(InMemoryDataset, self).__init__()
+        self.proto_desc.name = "MultiSlotInMemoryDataFeed"
+        self.fleet_send_batch_size = None
+        self.is_user_set_queue_num = False
+        self.queue_num = None
+        self.parse_ins_id = False
+        self.parse_content = False
+        self.parse_logkey = False
+        self.merge_by_sid = True
+        self.enable_pv_merge = False
+        self.merge_by_lineid = False
+        self.fleet_send_sleep_seconds = None
+
+    def set_feed_type(self, data_feed_type):
+        """
+        Set data_feed_desc
+        """
+        self.proto_desc.name = data_feed_type
+
+    def _prepare_to_run(self):
+        """
+        Set data_feed_desc before load or shuffle,
+        user no need to call this function.
+        """
+        if self.thread_num <= 0:
+            self.thread_num = 1
+        self.dataset.set_thread_num(self.thread_num)
+        if self.queue_num is None:
+            self.queue_num = self.thread_num
+        self.dataset.set_queue_num(self.queue_num)
+        self.dataset.set_parse_ins_id(self.parse_ins_id)
+        self.dataset.set_parse_content(self.parse_content)
+        self.dataset.set_parse_logkey(self.parse_logkey)
+        self.dataset.set_merge_by_sid(self.merge_by_sid)
+        self.dataset.set_enable_pv_merge(self.enable_pv_merge)
+        self.dataset.set_data_feed_desc(self.desc())
+        self.dataset.create_channel()
+        self.dataset.create_readers()
+
+    def _dynamic_adjust_before_train(self, thread_num):
+        if not self.is_user_set_queue_num:
+            self.dataset.dynamic_adjust_channel_num(thread_num, False)
+        self.dataset.dynamic_adjust_readers_num(thread_num)
+
+    def _dynamic_adjust_after_train(self):
+        if not self.is_user_set_queue_num:
+            self.dataset.dynamic_adjust_channel_num(self.thread_num, False)
+        self.dataset.dynamic_adjust_readers_num(self.thread_num)
+
+    def set_queue_num(self, queue_num):
+        """
+        Set Dataset output queue num, training threads get data from queues
+
+        Args:
+            queue_num(int): dataset output queue num
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_queue_num(12)
+
+        """
+        self.is_user_set_queue_num = True
+        self.queue_num = queue_num
+
+    def set_parse_ins_id(self, parse_ins_id):
+        """
+        Set id Dataset need to parse insid
+
+        Args:
+            parse_ins_id(bool): if parse ins_id or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_parse_ins_id(True)
+
+        """
+        self.parse_ins_id = parse_ins_id
+
+    def set_parse_content(self, parse_content):
+        """
+        Set if Dataset need to parse content
+
+        Args:
+            parse_content(bool): if parse content or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_parse_content(True)
+
+        """
+        self.parse_content = parse_content
+
+    def set_parse_logkey(self, parse_logkey):
+        """
+        Set if Dataset need to parse logkey
+
+        Args:
+            parse_content(bool): if parse logkey or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_parse_logkey(True)
+
+        """
+        self.parse_logkey = parse_logkey
+
+    def set_merge_by_sid(self, merge_by_sid):
+        """
+        Set if Dataset need to merge sid. If not, one ins means one Pv.
+
+        Args:
+            merge_by_sid(bool): if merge sid or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_merge_by_sid(True)
+
+        """
+        self.merge_by_sid = merge_by_sid
+
+    def set_enable_pv_merge(self, enable_pv_merge):
+        """
+        Set if Dataset need to merge pv.
+
+        Args:
+            enable_pv_merge(bool): if enable_pv_merge or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_enable_pv_merge(True)
+
+        """
+        self.enable_pv_merge = enable_pv_merge
+
+    def preprocess_instance(self):
+        """
+        Merge pv instance and convey it from input_channel to input_pv_channel. 
+        It will be effective when enable_pv_merge_ is True.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.preprocess_instance()
+
+        """
+        self.dataset.preprocess_instance()
+
+    def set_current_phase(self, current_phase):
+        """
+        Set current phase in train. It is useful for untest.
+        current_phase : 1 for join, 0 for update.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.set_current_phase(1)
+
+        """
+        self.dataset.set_current_phase(current_phase)
+
+    def postprocess_instance(self):
+        """
+        Divide pv instance and convey it to input_channel.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.preprocess_instance()
+              exe.train_from_dataset(dataset)
+              dataset.postprocess_instance()
+
+        """
+        self.dataset.postprocess_instance()
+
+    def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
+        """
+        Set fleet send batch size, default is 1024
+
+        Args:
+            fleet_send_batch_size(int): fleet send batch size
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_fleet_send_batch_size(800)
+
+        """
+        self.fleet_send_batch_size = fleet_send_batch_size
+
+    def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
+        """
+        Set fleet send sleep time, default is 0
+
+        Args:
+            fleet_send_sleep_seconds(int): fleet send sleep time
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_fleet_send_sleep_seconds(2)
+
+        """
+        self.fleet_send_sleep_seconds = fleet_send_sleep_seconds
+
+    def set_merge_by_lineid(self, merge_size=2):
+        """
+        Set merge by line id, instances of same line id will be merged after
+        shuffle, you should parse line id in data generator.
+
+        Args:
+            merge_size(int): ins size to merge. default is 2.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset.set_merge_by_lineid()
+
+        """
+        self.dataset.set_merge_by_lineid(merge_size)
+        self.merge_by_lineid = True
+        self.parse_ins_id = True
+
+    def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
+        self.dataset.set_generate_unique_feasigns(generate_uni_feasigns)
+        self.gen_uni_feasigns = generate_uni_feasigns
+        self.local_shard_num = shard_num
+
+    def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
+                                     consume_thread_num, shard_num):
+        self.dataset.generate_local_tables_unlock(
+            table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)
+
+    def load_into_memory(self):
+        """
+        Load data into memory
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+        """
+        self._prepare_to_run()
+        self.dataset.load_into_memory()
+
+    def preload_into_memory(self, thread_num=None):
+        """
+        Load data into memory in async mode
+
+        Args:
+            thread_num(int): preload thread num
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.preload_into_memory()
+              dataset.wait_preload_done()
+        """
+        self._prepare_to_run()
+        if thread_num is None:
+            thread_num = self.thread_num
+        self.dataset.set_preload_thread_num(thread_num)
+        self.dataset.create_preload_readers()
+        self.dataset.preload_into_memory()
+
+    def wait_preload_done(self):
+        """
+        Wait preload_into_memory done
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.preload_into_memory()
+              dataset.wait_preload_done()
+        """
+        self.dataset.wait_preload_done()
+        self.dataset.destroy_preload_readers()
+
+    def local_shuffle(self):
+        """
+        Local shuffle
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.local_shuffle()
+        """
+        self.dataset.local_shuffle()
+
+    def global_shuffle(self, fleet=None, thread_num=12):
+        """
+        Global shuffle.
+        Global shuffle can be used only in distributed mode. i.e. multiple
+        processes on single machine or multiple machines training together.
+        If you run in distributed mode, you should pass fleet instead of None.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.global_shuffle(fleet)
+
+        Args:
+            fleet(Fleet): fleet singleton. Default None.
+            thread_num(int): shuffle thread num. Default is 12.
+
+        """
+        trainer_num = 1
+        if fleet is not None:
+            fleet._role_maker.barrier_worker()
+            trainer_num = fleet.worker_num()
+        if self.fleet_send_batch_size is None:
+            self.fleet_send_batch_size = 1024
+        if self.fleet_send_sleep_seconds is None:
+            self.fleet_send_sleep_seconds = 0
+        self.dataset.register_client2client_msg_handler()
+        self.dataset.set_trainer_num(trainer_num)
+        self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
+        self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
+        if fleet is not None:
+            fleet._role_maker.barrier_worker()
+        self.dataset.global_shuffle(thread_num)
+        if fleet is not None:
+            fleet._role_maker.barrier_worker()
+        if self.merge_by_lineid:
+            self.dataset.merge_by_lineid()
+        if fleet is not None:
+            fleet._role_maker.barrier_worker()
+
+    def release_memory(self):
+        """
+        :api_attr: Static Graph
+        
+        Release InMemoryDataset memory data, when data will not be used again.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.global_shuffle(fleet)
+              exe = fluid.Executor(fluid.CPUPlace())
+              exe.run(fluid.default_startup_program())
+              exe.train_from_dataset(fluid.default_main_program(), dataset)
+              dataset.release_memory()
+
+        """
+        self.dataset.release_memory()
+
+    def get_pv_data_size(self):
+        """
+        Get memory data size of Pv, user can call this function to know the pv num
+        of ins in all workers after load into memory.
+
+        Note:
+            This function may cause bad performance, because it has barrier
+
+        Returns:
+            The size of memory pv data.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              print dataset.get_pv_data_size()
+
+        """
+        return self.dataset.get_pv_data_size()
+
+    def get_memory_data_size(self, fleet=None):
+        """
+        Get memory data size, user can call this function to know the num
+        of ins in all workers after load into memory.
+
+        Note:
+            This function may cause bad performance, because it has barrier
+
+        Args:
+            fleet(Fleet): Fleet Object.
+
+        Returns:
+            The size of memory data.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              print dataset.get_memory_data_size(fleet)
+
+        """
+        import numpy as np
+        local_data_size = self.dataset.get_memory_data_size()
+        local_data_size = np.array([local_data_size])
+        if fleet is not None:
+            global_data_size = local_data_size * 0
+            fleet._role_maker.all_reduce_worker(local_data_size,
+                                                global_data_size)
+            return global_data_size[0]
+        return local_data_size[0]
+
+    def get_shuffle_data_size(self, fleet=None):
+        """
+        Get shuffle data size, user can call this function to know the num
+        of ins in all workers after local/global shuffle.
+
+        Note:
+            This function may cause bad performance to local shuffle,
+            because it has barrier. It does not affect global shuffle.
+
+        Args:
+            fleet(Fleet): Fleet Object.
+
+        Returns:
+            The size of shuffle data.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.global_shuffle(fleet)
+              print dataset.get_shuffle_data_size(fleet)
+
+        """
+        import numpy as np
+        local_data_size = self.dataset.get_shuffle_data_size()
+        local_data_size = np.array([local_data_size])
+        if fleet is not None:
+            global_data_size = local_data_size * 0
+            fleet._role_maker.all_reduce_worker(local_data_size,
+                                                global_data_size)
+            return global_data_size[0]
+        return local_data_size[0]
+
+
+class QueueDataset(DatasetBase):
+    """
+    QueueDataset, it will process data streamly.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+
+    """
+
+    def __init__(self):
+        """
+        Initialize QueueDataset
+        This class should be created by DatasetFactory
+        """
+        super(QueueDataset, self).__init__()
+        self.proto_desc.name = "MultiSlotDataFeed"
+
+    def _prepare_to_run(self):
+        """
+        Set data_feed_desc/thread num/filelist before run,
+        user no need to call this function.
+        """
+        if self.thread_num > len(self.filelist):
+            self.thread_num = len(self.filelist)
+        if self.thread_num == 0:
+            self.thread_num = 1
+        self.dataset.set_thread_num(self.thread_num)
+        self.dataset.set_filelist(self.filelist)
+        self.dataset.set_data_feed_desc(self.desc())
+        self.dataset.create_readers()
+
+    def local_shuffle(self):
+        """
+        Local shuffle data.
+
+        Local shuffle is not supported in QueueDataset
+        NotImplementedError will be raised
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+              dataset.local_shuffle()
+
+        Raises:
+            NotImplementedError: QueueDataset does not support local shuffle
+
+        """
+        raise NotImplementedError(
+            "QueueDataset does not support local shuffle, "
+            "please use InMemoryDataset for local_shuffle")
+
+    def global_shuffle(self, fleet=None):
+        """
+        Global shuffle data.
+
+        Global shuffle is not supported in QueueDataset
+        NotImplementedError will be raised
+
+        Args:
+            fleet(Fleet): fleet singleton. Default None.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+              dataset.global_shuffle(fleet)
+
+        Raises:
+            NotImplementedError: QueueDataset does not support global shuffle
+
+        """
+        raise NotImplementedError(
+            "QueueDataset does not support global shuffle, "
+            "please use InMemoryDataset for global_shuffle")
+
+
+class FileInstantDataset(DatasetBase):
+    """
+    FileInstantDataset, it will process data streamly.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset")
+    """
+
+    def __init__(self):
+        """
+        Initialize FileInstantDataset
+        This class should be created by DatasetFactory
+        """
+        super(FileInstantDataset, self).__init__()
+        self.proto_desc.name = "MultiSlotFileInstantDataFeed"
+
+    def local_shuffle(self):
+        """
+        Local shuffle
+        FileInstantDataset does not support local shuffle
+        """
+        raise NotImplementedError(
+            "FileInstantDataset does not support local shuffle, "
+            "please use InMemoryDataset for local_shuffle")
+
+    def global_shuffle(self, fleet=None):
+        """
+        Global shuffle
+        FileInstantDataset does not support global shuffle
+        """
+        raise NotImplementedError(
+            "FileInstantDataset does not support global shuffle, "
+            "please use InMemoryDataset for global_shuffle")
+
+
+class BoxPSDataset(InMemoryDataset):
+    """
+    BoxPSDataset: derived from InMemoryDataset.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+    """
+
+    def __init__(self):
+        """
+        Initialize BoxPSDataset
+        This class should be created by DatasetFactory
+        """
+        super(BoxPSDataset, self).__init__()
+        self.boxps = core.BoxPS(self.dataset)
+        self.proto_desc.name = "PaddleBoxDataFeed"
+
+    def set_date(self, date):
+        """
+        Workaround for date
+        """
+        year = int(date[:4])
+        month = int(date[4:6])
+        day = int(date[6:])
+        self.boxps.set_date(year, month, day)
+
+    def begin_pass(self):
+        """
+        Begin Pass
+        Notify BoxPS to load sparse parameters of next pass to GPU Memory 
+
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              dataset.begin_pass()
+        """
+        self.boxps.begin_pass()
+
+    def end_pass(self, need_save_delta):
+        """
+        End Pass
+        Notify BoxPS that current pass ended 
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              dataset.end_pass(True)
+        """
+        self.boxps.end_pass(need_save_delta)
+
+    def wait_preload_done(self):
+        """
+        Wait async preload done
+        Wait Until Feed Pass Done
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.preload_into_memory()
+              dataset.wait_preload_done()
+        """
+        self.boxps.wait_feed_pass_done()
+
+    def load_into_memory(self):
+        """
+        Load next pass into memory and notify boxps to fetch its emb from SSD
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+	    """
+        self._prepare_to_run()
+        self.boxps.load_into_memory()
+
+    def preload_into_memory(self):
+        """
+        Begin async preload next pass while current pass may be training
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.preload_into_memory()
+        """
+        self._prepare_to_run()
+        self.boxps.preload_into_memory()
+
+    def _dynamic_adjust_before_train(self, thread_num):
+        if not self.is_user_set_queue_num:
+            self.dataset.dynamic_adjust_channel_num(thread_num, True)
+        self.dataset.dynamic_adjust_readers_num(thread_num)
+
+    def _dynamic_adjust_after_train(self):
+        pass
+
+    def slots_shuffle(self, slots):
+        """
+        Slots Shuffle 
+        Slots Shuffle is a shuffle method in slots level, which is usually used 
+        in sparse feature with large scale of instances. To compare the metric, i.e.
+        auc while doing slots shuffle on one or several slots with baseline to 
+        evaluate the importance level of slots(features).
+        
+        Args:
+            slots(list[string]): the set of slots(string) to do slots shuffle.
+
+        Examples:
+            import paddle.fluid as fluid
+            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            dataset.set_merge_by_lineid()
+            #suppose there is a slot 0
+            dataset.slots_shuffle(['0'])
+        """
+        slots_set = set(slots)
+        self.boxps.slots_shuffle(slots_set)
diff --git a/python/paddle/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
similarity index 60%
rename from python/paddle/fleet/launch.py
rename to python/paddle/distributed/fleet/launch.py
index de5e0b66b3e41818875f84e4ba5dd0557bfdb02f..29a1bda92f17443e6c38b070379481aaa419b1d4 100644
--- a/python/paddle/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-paddle.distributed.launch is a module that spawns multiple distributed 
+fleetrun is a module that spawns multiple distributed
 process on each training node for gpu training and cpu training.
 Usage:
-    In both of single node training or multiple node training, this module 
+    In both of single node training or multiple node training, this module
 launch a process on each of the given gpu card or cpu machine.
     GPU training:
     1. for single node training with all visible gpu cards:
@@ -24,24 +24,33 @@ launch a process on each of the given gpu card or cpu machine.
        fleetrun --gpus="0,1,2,3" your_training_py (arg1 arg2 and all others)
     3. for multiple node training such as two node:192.168.0.16, 192.168.0.17
         on 192.168.0.16:
-            fleetrun --ips="192.168.0.16,192.168.0.17" --node_ip=192.168.0.16 \
+            fleetrun --ips="192.168.0.16,192.168.0.17" \
                 your_training_py (arg1 arg2 and all others)
         on 192.168.0.17:
             fleetrun --ips="192.168.0.16,192.168.0.17" \
-                --node_ip=192.168.0.17 \
                 your_training_py (arg1 arg2 and all others)
     CPU training:
     1. for single node training with multi servers and workers:
-        fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
+        fleetrun --server_num=2 --worker_num=2 your_training_py (arg1 arg2 and all others)
     2. for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
-        with 2 servers and  4 workers.
+        with 2 servers and 4 workers.
         on 192.168.0.16:
-            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-                --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
                 your_training_py (arg1 arg2 and all others)
         on 192.168.0.17:
             fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-                --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+                --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
+                your_training_py (arg1 arg2 and all others)
+    3. use gloo backend for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
+        with 2 servers and 4 workers. (workers should set port)
+        on 192.168.0.16:
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
+                your_training_py (arg1 arg2 and all others)
+        on 192.168.0.17:
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
                 your_training_py (arg1 arg2 and all others)
 """
 
@@ -57,8 +66,8 @@ from argparse import ArgumentParser, REMAINDER
 import paddle
 import paddle.fluid as fluid
 
-from paddle.fleet.launch_utils import *
-import paddle.fleet.cloud_utils as cloud_utils
+from paddle.distributed.fleet.launch_utils import *
+import paddle.distributed.fleet.cloud_utils as cloud_utils
 
 
 def _print_arguments(args):
@@ -96,15 +105,14 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         "--servers", type=str, default="", help="User defined servers ip:port")
     parser.add_argument(
         "--workers", type=str, default="", help="User defined workers ip:port")
-    parser.add_argument(
-        "--worker_num", type=int, default=2, help="number of workers")
+    parser.add_argument("--worker_num", type=int, help="number of workers")
 
-    parser.add_argument(
-        "--server_num", type=int, default=2, help="number of servers")
+    parser.add_argument("--server_num", type=int, help="number of servers")
 
     parser.add_argument(
         "--log_dir",
         type=str,
+        default="log",
         help="The path for each process's log.If it's not set, the log will printed to default pipe."
     )
     #positional
@@ -129,11 +137,11 @@ def get_cluster_from_args(args, gpus):
         _, node_ip = get_host_name_ip()
 
     # node_ip = args.node_ip
-    assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips:{%s}" \
+    assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
                 % (node_ip, node_ips)
     node_rank = node_ips.index(node_ip)
 
-    logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
+    logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
         node_ips, node_ip, node_rank))
 
     free_ports = None
@@ -187,8 +195,11 @@ def launch_collective(args):
     cluster = None
     pod = None
 
+    start_port = 6170
+    if os.environ.get('FLAGS_START_PORT') is not None:
+        start_port = os.environ.get('FLAGS_START_PORT')
     if cloud_utils.use_paddlecloud() and trainers_num != 1:
-        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus)
+        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port)
         logger.info("get cluster from cloud:{}".format(cluster))
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
@@ -213,11 +224,87 @@ def launch_collective(args):
 
 
 def launch_ps(args):
-    worker_num = args.worker_num
-    server_num = args.server_num
+    ports = None
     start_port = 6170
-    if os.environ.get('FLAGS_START_PORT') is not None:
-        start_port = os.environ.get('FLAGS_START_PORT')
+    if args.server_num:
+        server_num = args.server_num
+        ports = get_ports(server_num, 0)
+        server_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
+    else:
+        assert args.servers != "", "The setting of CPU mode must be either server_num or servers."
+        server_endpoints = args.servers
+    server_endpoints_ips = [
+        x.strip().split(":")[0] for x in server_endpoints.split(",")
+    ]
+    server_endpoints_port = [
+        x.strip().split(":")[1] for x in server_endpoints.split(",")
+    ]
+    server_num = len(server_endpoints_ips)
+
+    if args.worker_num:
+        worker_num = args.worker_num
+        ports = get_ports(worker_num, server_num)
+        worker_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
+    else:
+        assert args.workers != "", "The setting of CPU mode must be either worker_num or workers."
+        worker_endpoints = args.workers
+    worker_endpoints_ips = [
+        x.strip().split(":")[0] for x in worker_endpoints.split(",")
+    ]
+    worker_num = len(worker_endpoints_ips)
+    node_ips = list(set(server_endpoints_ips + worker_endpoints_ips))
+    worker_endpoints_len = [
+        len(x.strip().split(":")) for x in worker_endpoints.split(",")
+    ]
+    if 1 in worker_endpoints_len:
+        # if no port value in worker_endpoints, will set default port values.
+        worker_endpoints_port = range(start_port + server_num,
+                                      start_port + server_num + worker_num, 1)
+    else:
+        worker_endpoints_port = [
+            x.strip().split(":")[1] for x in worker_endpoints.split(",")
+        ]
+
+    # local train
+    if len(set(node_ips)) == 1:
+        current_node_ip = node_ips[0]
+    else:
+        _, current_node_ip = get_host_name_ip()
+
+    assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
+                % (current_node_ip, node_ips)
+    node_rank = node_ips.index(current_node_ip)
+    logger.debug(
+        "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
+        format(node_ips, current_node_ip, node_rank, server_endpoints_port))
+
+    cluster = Cluster(hdfs=None)
+    server_rank = 0
+    worker_rank = 0
+    for node_rank, ip in enumerate(node_ips):
+        pod = Pod()
+        pod.rank = node_rank
+        pod.addr = ip
+        for i in range(len(server_endpoints_ips)):
+            if ip == server_endpoints_ips[i]:
+                server = Trainer()
+                server.endpoint = "%s:%s" % (ip, server_endpoints_port[i])
+                server.rank = server_rank
+                server_rank += 1
+                pod.servers.append(server)
+        for j in range(len(worker_endpoints_ips)):
+            if ip == worker_endpoints_ips[j]:
+                worker = Trainer()
+                worker.endpoint = "%s:%s" % (ip, worker_endpoints_port[i])
+                worker.rank = worker_rank
+                worker_rank += 1
+                pod.workers.append(worker)
+
+        cluster.pods.append(pod)
+
+    pod_rank = node_ips.index(current_node_ip)
+    pod = cluster.pods[pod_rank]
+
     default_env = os.environ.copy()
     current_env = copy.copy(default_env)
     current_env.pop("http_proxy", None)
@@ -225,68 +312,78 @@ def launch_ps(args):
     procs = []
     cmds = []
     log_fns = []
-    ports = range(start_port, start_port + server_num, 1)
-    default_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
-    user_endpoints = ""
-    if args.servers == "":
-        user_endpoints = default_endpoints
-    else:
-        user_endpoints = args.servers
-    user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
-    user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
-    for i in range(server_num):
+    for idx, cur_server in enumerate(pod.servers):
         current_env.update({
-            "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
-            "PADDLE_PORT": user_endpoints_port[i],
+            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": user_endpoints_ips[i]
+            "POD_IP": cur_server.endpoint.split(":")[0]
         })
 
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
         cmds.append(cmd)
+
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/serverlog.%d" % (args.log_dir, i), "w")
+            fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
             log_fns.append(fn)
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env)
-        procs.append(proc)
 
-    for i in range(worker_num):
+        tp = TrainerProc()
+        tp.proc = proc
+        tp.rank = cur_server.rank
+        tp.local_rank = idx
+        tp.log_fn = fn
+        tp.log_offset = 0 if fn else None
+        tp.cmd = cmd
+
+        procs.append(tp)
+
+    for idx, cur_worker in enumerate(pod.workers):
         current_env.update({
-            "PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
+            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(i)
+            "PADDLE_TRAINER_ID": str(cur_worker.rank)
         })
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
         cmds.append(cmd)
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
+            fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
             log_fns.append(fn)
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env)
-        procs.append(proc)
+
+        tp = TrainerProc()
+        tp.proc = proc
+        tp.rank = cur_worker.rank
+        tp.local_rank = idx
+        tp.log_fn = fn
+        tp.log_offset = 0 if fn else None
+        tp.cmd = cmd
+
+        procs.append(tp)
 
     # only wait worker to finish here
     for i, proc in enumerate(procs):
-        if i < server_num:
+        if i < len(pod.servers):
             continue
-        procs[i].wait()
+        procs[i].proc.wait()
         if len(log_fns) > 0:
             log_fns[i].close()
 
     print("all workers exit, going to finish parameter server", file=sys.stderr)
-    for i in range(server_num):
+    for i in range(len(pod.servers)):
         if len(log_fns) > 0:
             log_fns[i].close()
-        procs[i].terminate()
+        procs[i].proc.terminate()
     print("all parameter server are killed", file=sys.stderr)
 
 
@@ -303,11 +400,15 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
-    if len(has_ps_args) > 0 or fluid.core.get_cuda_device_count() == 0:
-        logger.info("Run cpu parameter-sever mode.")
+    cuda_device_num = fluid.core.get_cuda_device_count()
+    if len(has_ps_args) > 0 or cuda_device_num == 0:
+        logger.info(
+            "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}".
+            format(has_ps_args, cuda_device_num))
         launch_ps(args)
     elif len(has_collective_args) > 0:
-        logger.info("Run gpu collective mode.")
+        logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}".
+                    format(has_collective_args, cuda_device_num))
         launch_collective(args)
     else:
         logger.warning(
diff --git a/python/paddle/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
similarity index 88%
rename from python/paddle/fleet/launch_utils.py
rename to python/paddle/distributed/fleet/launch_utils.py
index 040e7254f8c5b23465e4c65f27910f773ea62921..350d8ae2b44db3e8f8e6b00d95c2b7a9ca91f88b 100644
--- a/python/paddle/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -142,12 +142,16 @@ class Pod(object):
         self.addr = None
         self.port = None
         self.trainers = []
+        self.servers = []
+        self.workers = []
         self.gpus = []
 
     def __str__(self):
-        return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
-            self.rank, self.id, self.addr, self.port, self.gpus,
-            [str(t) for t in self.trainers])
+        return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \
+            workers:{}".format(self.rank, self.id, self.addr, self.port,
+                               self.gpus, [str(t) for t in self.trainers],
+                               [str(s) for s in self.servers],
+                               [str(w) for w in self.workers])
 
     def __eq__(self, pod):
         if self.rank != pod.rank or \
@@ -168,6 +172,26 @@ class Pod(object):
                                                        pod.trainers[i]))
                 return False
 
+        if len(self.servers) != len(pod.servers):
+            logger.debug("servers {} != {}".format(self.servers, pod.servers))
+            return False
+
+        for i in range(len(self.servers)):
+            if self.servers[i] != pod.servers[i]:
+                logger.debug("servers {} != {}".format(self.servers[i],
+                                                       pod.servers[i]))
+                return False
+
+        if len(self.workers) != len(pod.workers):
+            logger.debug("workers {} != {}".format(self.workers, pod.workers))
+            return False
+
+        for i in range(len(self.workers)):
+            if self.workers[i] != pod.workers[i]:
+                logger.debug("workers {} != {}".format(self.workers[i],
+                                                       pod.workers[i]))
+                return False
+
         return True
 
     def __ne__(self, pod):
@@ -303,6 +327,17 @@ def find_free_ports(num):
     return None
 
 
+def get_ports(num, offset):
+    if os.environ.get('FLAGS_START_PORT') is None:
+        ports = find_free_ports(num)
+        if ports is not None:
+            ports = list(ports)
+    else:
+        start_port = os.environ.get('FLAGS_START_PORT')
+        ports = range(start_port + offset, start_port + offset + num, 1)
+    return ports
+
+
 class TrainerProc(object):
     def __init__(self):
         self.proc = None
diff --git a/python/paddle/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
old mode 100755
new mode 100644
similarity index 80%
rename from python/paddle/fleet/meta_optimizers/__init__.py
rename to python/paddle/distributed/fleet/meta_optimizers/__init__.py
index 718805c5aadaf3476fa1fc495a355395fec6396d..075e8b6c4302d792606849fc2981e46ccead1e56
--- a/python/paddle/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -15,17 +15,24 @@ from .amp_optimizer import AMPOptimizer
 from .recompute_optimizer import RecomputeOptimizer
 from .gradient_merge_optimizer import GradientMergeOptimizer
 from .graph_execution_optimizer import GraphExecutionOptimizer
+from .async_optimizer import AsyncMetaOptimizer
 from .pipeline_optimizer import PipelineOptimizer
 from .localsgd_optimizer import LocalSGDOptimizer
 from .lars_optimizer import LarsOptimizer
+from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer
 from .dgc_optimizer import DGCOptimizer
+from .lamb_optimizer import LambOptimizer
 
 __all__ = [
     'AMPOptimizer',
     'RecomputeOptimizer',
     'GradientMergeOptimizer',
+    'AsyncMetaOptimizer',
+    'GraphExecutionOptimizer',
     'PipelineOptimizer',
     'LocalSGDOptimizer',
     'LarsOptimizer',
+    'AsyncGraphExecutionOptimizer',
     'DGCOptimizer',
+    'LambOptimizer',
 ]
diff --git a/python/paddle/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
similarity index 89%
rename from python/paddle/fleet/meta_optimizers/amp_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 8316d807fa87062a8e3fba0bcb3bd057d2231032..66db14209b4c57475c30c6dde083593e27f04ea0 100644
--- a/python/paddle/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -23,7 +23,12 @@ class AMPOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.amp_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
+            "LocalSGDOptimizer", "GradientMergeOptimizer",
+            "GraphExecutionOptimizer"
+        ]
+        self.meta_optimizers_black_list = ["DGCOptimizer"]
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -37,6 +42,7 @@ class AMPOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.amp = False
+        dist_strategy.amp_configs = {}
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0dee220aafd07bf69a198c6b03e6c957c50d4ce
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle import fluid
+from paddle.fluid import compiler
+from .async_optimizer import AsyncMetaOptimizer
+
+
+class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
+    def __init__(self, optimizer):
+        super(AsyncGraphExecutionOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = []
+
+    def _can_apply(self):
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        if k_steps < 0:
+            return False
+
+        if self.role_maker.is_server():
+            return False
+
+        return True
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.a_sync_configs = {}
+
+    def _is_graph_out(self):
+        return True
+
+    def _try_to_compile(self, main_program, loss):
+        dist_strategy = self._get_distributed_strategy()
+
+        build_strategy = dist_strategy.get_build_strategy()
+        exec_strategy = dist_strategy.get_execute_strategy()
+
+        self._compiled_program = compiler.CompiledProgram(main_program)
+
+        self._compiled_program.with_data_parallel(
+            loss_name=loss.name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy,
+            share_vars_from=None)
+
+        return self._compiled_program
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        program = loss.block.program
+        compiled_program = self._try_to_compile(program, loss)
+        program._graph = compiled_program
+        # just return self.optimizer_ops and self.param_grads
+        return None, None
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b65435497284d279ebdea026e7ac88883a724c7c
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
@@ -0,0 +1,142 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle import fluid
+from .meta_optimizer_base import MetaOptimizerBase
+
+
+class AsyncMetaOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(AsyncMetaOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = []
+
+    def _is_graph_out(self):
+        return False
+
+    def _can_apply(self):
+        if self.role_maker._is_collective:
+            return False
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        return True if k_steps >= 0 else False
+
+    def _get_distributed_strategy(self):
+        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+
+        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
+        strategy = None
+
+        if not self.user_defined_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_sync_strategy()
+
+        if self.user_defined_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_async_strategy()
+
+        if self.user_defined_strategy.a_sync and k_steps > 0:
+            strategy = StrategyFactory.create_geo_strategy(k_steps)
+
+        if not strategy:
+            raise ValueError("k_steps must be invalid value, please check")
+
+        return strategy
+
+    def _build_trainer_programs(self, compiled_config):
+        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
+
+        _main = compiled_config.origin_main_program.clone()
+        _startup = compiled_config.origin_startup_program.clone()
+
+        if not compiled_config.is_geo_mode():
+            # for main program
+            _main = worker.delete_optimizer_pass(_main, compiled_config)
+            _main = worker.distributed_ops_pass(_main, compiled_config)
+            _main = worker.append_send_ops_pass(_main, compiled_config)
+
+            # for startup program
+            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
+            _startup = worker.init_from_server_pass(_startup, compiled_config)
+            _startup = worker.delet_extra_optimizes_pass(_startup,
+                                                         compiled_config)
+        else:
+            _main = worker.append_send_ops_pass(_main, compiled_config)
+            _startup = _startup
+
+        return _main, _startup
+
+    def _build_pserver_programs(self, compiled_config):
+        from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
+
+        _main = fluid.Program()
+        _startup = fluid.Program()
+
+        if not compiled_config.is_geo_mode():
+            _main = server.add_listen_and_serv_pass(_main, compiled_config)
+            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
+            _main = server.add_optimizer_pass(_main, compiled_config)
+            _main = server.large_scale_sparse_pass(_main, _main,
+                                                   compiled_config, False)
+            _startup = server.build_pserver_startup_program_pass(
+                _startup, _main, compiled_config)
+            _startup = server.large_scale_sparse_pass(_startup, _main,
+                                                      compiled_config, True)
+
+            if not compiled_config.is_sync_mode():
+                _main = server.delete_unused_in_main_pass(_main,
+                                                          compiled_config)
+
+            _startup = server.delete_unused_in_startup_pass(_startup, _main,
+                                                            compiled_config)
+        else:
+            _main = server.add_listen_and_serv_pass(_main, compiled_config)
+            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
+            _main = server.add_geo_optimizer_pass(_main, compiled_config)
+            _main = server.large_scale_sparse_pass(_main, _main,
+                                                   compiled_config, False)
+            _startup = server.build_pserver_startup_program_pass(
+                _startup, _main, compiled_config)
+            _startup = server.large_scale_sparse_pass(_startup, _main,
+                                                      compiled_config, True)
+            _startup = server.delete_unused_in_startup_pass(_startup, _main,
+                                                            compiled_config)
+
+        return _main, _startup
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        self.inner_opt.minimize(loss, startup_program, parameter_list,
+                                no_grad_set)
+        strategy = self._get_distributed_strategy()
+
+        _origin_main_program = loss.block.program
+        _origin_startup_program = startup_program
+        from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
+
+        compiled_config = public.CompileTimeStrategy(_origin_main_program,
+                                                     _origin_startup_program,
+                                                     strategy, self.role_maker)
+
+        main_program, startup_program = \
+            self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \
+                else self._build_pserver_programs(compiled_config)
+
+        loss.block.program = main_program
+        fluid.framework.switch_startup_program(startup_program)
+
+        return None, None
+
+    def _disable_strategy(self, dist_strategy):
+        self.user_defined_strategy.a_sync_configs = {}
diff --git a/python/paddle/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
similarity index 100%
rename from python/paddle/fleet/meta_optimizers/common.py
rename to python/paddle/distributed/fleet/meta_optimizers/common.py
diff --git a/python/paddle/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
similarity index 95%
rename from python/paddle/fleet/meta_optimizers/dgc_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index c9a28fdaf11dd0d4d45cfd3fb1904b80dc136711..f34786f9dc309dd1f03319368bbc93ef1bfc03e3 100644
--- a/python/paddle/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -25,6 +25,7 @@ class DGCOptimizer(MetaOptimizerBase):
         self.dgc_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -68,11 +69,7 @@ class DGCOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.dgc = False
-        dist_strategy.dgc_configs = {
-            'rampup_begin_step': 0,
-            'rampup_step': 1,
-            'sparsity': [0.999]
-        }
+        dist_strategy.dgc_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
similarity index 88%
rename from python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 668cf605defaf5eb3f4e205c5a18548e45449a9c..bd52179a35862241768ad5bd01eedf16732ad3b6 100644
--- a/python/paddle/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -16,13 +16,20 @@ from .meta_optimizer_base import MetaOptimizerBase
 
 __all__ = ["GradientMergeOptimizer"]
 
+# amp + gradient merge + lamb
+
 
 class GradientMergeOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(GradientMergeOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         self.wrapped_opt = GM(optimizer)
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer",
+            "LambOptimizer",
+            "GraphExecutionOptimizer",
+        ]
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -40,7 +47,7 @@ class GradientMergeOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.gradient_merge = False
-        dist_strategy.gradient_merge_configs = {"k_steps": 1, "avg": True}
+        dist_strategy.gradient_merge_configs = {}
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
similarity index 93%
rename from python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 0473f7c1d689fb9cc2fc856a41076d0ab68baf0d..ace31687338f918ef260b3134b0bd429795542d0 100644
--- a/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -25,6 +25,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
 
     def _is_graph_out(self):
         return True
@@ -119,18 +120,26 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         local_build_strategy.nccl_comm_num = \
                     dist_strategy.nccl_comm_num
 
+        if self.user_defined_strategy.recompute == True:
+            logging.warn(
+                "set enable_sequential_execution=True since you have enable the recompute strategy"
+            )
+            local_build_strategy.enable_sequential_execution = True
+
         exe_strategy = self.user_defined_strategy.execution_strategy
-        node_num = self.role_maker.worker_num()
+        worker_num = self.role_maker.worker_num()
+        node_num = self.role_maker.node_num()
 
         if self.role_maker._is_collective:
-            assert node_num >= 1, "nccl2 node_num must >= 1, now:{}" % node_num
+            assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num
 
-        if node_num <= 1:
+        if worker_num <= 1:
             # local mode
             if local_build_strategy.nccl_comm_num > 1:
                 logging.warn("set nccl_comm_num=1 since you only have 1 node.")
             local_build_strategy.nccl_comm_num = 1
 
+        if node_num <= 1:
             if local_build_strategy.use_hierarchical_allreduce:
                 logging.warn(
                     "set hierachical_allreduce=False since you only have 1 node."
@@ -190,7 +199,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
                  parameter_list=None,
                  no_grad_set=None):
         if startup_program == None:
-            startup_program = paddle.default_startup_program()
+            startup_program = paddle.static.default_startup_program()
         compiled_program = self._try_to_compile(startup_program,
                                                 loss.block.program, loss)
         loss.block.program._graph = compiled_program
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
new file mode 100755
index 0000000000000000000000000000000000000000..7e08a02eb1dc2e14b1871fe7743bbee8ade3feb3
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -0,0 +1,97 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.optimizer import LambOptimizer as LAMB
+from .meta_optimizer_base import MetaOptimizerBase
+import logging
+
+__all__ = ["LambOptimizer"]
+
+
+class LambOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(LambOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        self.lamb_opt = None
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = ["GraphExecutionOptimizer"]
+        self.meta_optimizers_black_list = []
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(LambOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+
+        opt = self.inner_opt
+        if not isinstance(opt, AdamOptimizer):
+            return
+
+        configs = self.user_defined_strategy.lamb_configs
+        if len(configs['exclude_from_weight_decay']) == 0:
+            _exclude_from_weight_decay_fn = None
+        else:
+
+            def exclude_fn(param):
+                exclude_list = configs['exclude_from_weight_decay']
+                for name in exclude_list:
+                    if param.name.endswith(name):
+                        return True
+                return False
+
+            _exclude_from_weight_decay_fn = exclude_fn
+
+        self.lamb_opt = LAMB(
+            learning_rate=opt._learning_rate,
+            lamb_weight_decay=configs['lamb_weight_decay'],
+            beta1=opt._beta1,
+            beta2=opt._beta2,
+            epsilon=opt._epsilon,
+            parameter_list=opt._parameter_list,
+            regularization=opt.regularization,
+            grad_clip=opt._grad_clip,
+            exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn,
+            name=opt._name)
+
+    def _can_apply(self):
+        if self.user_defined_strategy.lamb:
+            if not isinstance(self.inner_opt, AdamOptimizer):
+                logging.warn(
+                    "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".
+                    format(self.inner_opt.type))
+                return False
+            return True
+        return False
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.lamb = False
+        dist_strategy.lamb_configs = {}
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        return self.lamb_opt.backward(loss, startup_program, parameter_list,
+                                      no_grad_set, callbacks)
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        optimize_ops, params_grads = \
+            self.lamb_opt.minimize(loss, startup_program,
+                                      parameter_list, no_grad_set)
+        return optimize_ops, params_grads
diff --git a/python/paddle/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
similarity index 94%
rename from python/paddle/fleet/meta_optimizers/lars_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index ff535e3ebf259cf646cb9649ee45acc409a8d0d7..09c418fa79106d05cffae1e8bc18fac9c0cc8f34 100755
--- a/python/paddle/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -24,7 +24,8 @@ class LarsOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.lars_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = ["GraphExecutionOptimizer"]
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -58,10 +59,7 @@ class LarsOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.lars = False
-        dist_strategy.lars_configs = {
-            'lars_coeff': 0.001,
-            'lars_weight_decay': 0.0005,
-        }
+        dist_strategy.lars_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
similarity index 67%
rename from python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 05a120f8163755ad0effeccfe729f88782cfeebe..e22127c13999bfde7aa753ad1a66536913ab04f9 100644
--- a/python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from paddle.fluid import program_guard, layers
+from paddle.fluid import program_guard, layers, default_main_program
 from paddle.fluid.optimizer import Momentum, SGD
 from .meta_optimizer_base import MetaOptimizerBase
 from .common import OpRole, OP_ROLE_KEY, CollectiveHelper, is_update_op
@@ -25,6 +25,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         super(LocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = ["GraphExecutionOptimizer"]
         self.snapshot_key = '@SNAPSHOT'
 
     def _can_apply(self):
@@ -39,11 +40,35 @@ class LocalSGDOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.localsgd = False
-        dist_strategy.localsgd_configs = {'k_steps': 1}
+        dist_strategy.localsgd_configs = {}
 
     def snapshot_name(self, param_name):
         return param_name + self.snapshot_key
 
+    def create_snapshot_vars(self, program):
+        block = program.global_block()
+
+        non_dist_params = []
+        for param in block.iter_parameters():
+            if not param.is_distributed:
+                non_dist_params.append(param)
+
+        p2s = []
+        for param in non_dist_params:
+            snapshot = block.create_var(
+                name=self.snapshot_name(param.name),
+                shape=param.shape,
+                persistable=True,
+                stop_gradient=True,
+                dtype=param.dtype)
+            p2s.append([param, snapshot])
+        return p2s
+
+    def init_snapshot_vars(self, startup_program, param2snapshot):
+        with program_guard(startup_program):
+            for param, snapshot in param2snapshot:
+                layers.assign(param, snapshot)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
@@ -62,8 +87,11 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         self.nrings = 2
         collective_helper = CollectiveHelper(self.role_maker, self.nrings)
         collective_helper.update_startup_program(startup_program)
+        p2s = self.create_snapshot_vars(startup_program)
+        self.init_snapshot_vars(startup_program, p2s)
 
-        with program_guard(main_block.program):
+        p2s = self.create_snapshot_vars(main_block.program)
+        with program_guard(main_block.program, startup_program):
             step = layers.autoincreased_step_counter(begin=0)
             k_steps = layers.create_global_var(
                 name="k_steps",
@@ -79,6 +107,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                 persistable=True)
 
             if auto_steps:
+                avg_loss = layers.collective._c_allreduce(
+                    loss) / self.role_maker.worker_num()
+
                 lr_0 = layers.create_global_var(
                     name="lr_0",
                     shape=[1],
@@ -101,49 +132,32 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                 layers.cond(step == 0, initialize)
 
             def communicate():
-                ordered_param_snapshot = []
+                sub_block = default_main_program().current_block()
                 ring_id = -1
-                for idx, op in reversed(list(enumerate(main_block.ops))):
-                    if is_update_op(op):
-                        param = main_block.vars[op.input('Param')[0]]
-                        if param.is_distributed:
-                            continue
-
-                        snapshot = main_block.create_var(
-                            name=self.snapshot_name(param.name),
-                            shape=param.shape,
-                            persistable=True,
-                            stop_gradient=True,
-                            dtype=param.dtype)
-
-                        main_block._insert_op(
-                            idx + 1,
-                            type='elementwise_sub',
-                            inputs={'X': [snapshot],
-                                    'Y': [param]},
-                            outputs={'Out': [param]},
-                            attrs={OP_ROLE_KEY: OpRole.Optimize})
-                        main_block._insert_op(
-                            idx + 2,
-                            type='c_sync_calc_stream',
-                            inputs={'X': param},
-                            outputs={'Out': param},
-                            attrs={OP_ROLE_KEY: OpRole.Optimize})
-                        ring_id = (ring_id + 1) % self.nrings
-                        main_block._insert_op(
-                            idx + 3,
-                            type='c_allreduce_sum',
-                            inputs={'X': [param]},
-                            outputs={'Out': [param]},
-                            attrs={
-                                'ring_id': ring_id,
-                                OP_ROLE_KEY: OpRole.Optimize
-                            })
-
-                        ordered_param_snapshot.append((param, snapshot))
+                for param, snapshot in p2s:
+                    sub_block.append_op(
+                        type='elementwise_sub',
+                        inputs={'X': [snapshot],
+                                'Y': [param]},
+                        outputs={'Out': [param]},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(
+                        type='c_sync_calc_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    ring_id = (ring_id + 1) % self.nrings
+                    sub_block.append_op(
+                        type='c_allreduce_sum',
+                        inputs={'X': [param]},
+                        outputs={'Out': [param]},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Optimize
+                        })
 
                 for ring_id in range(self.nrings):
-                    main_block.append_op(
+                    sub_block.append_op(
                         type='c_sync_comm_stream',
                         inputs={'X': param},
                         outputs={'Out': param},
@@ -152,10 +166,8 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                             OP_ROLE_KEY: OpRole.Optimize
                         })
 
-                for param_snapshot in reversed(ordered_param_snapshot):
-                    param = param_snapshot[0]
-                    snapshot = param_snapshot[1]
-                    main_block.append_op(
+                for param, snapshot in p2s:
+                    sub_block.append_op(
                         type='scale',
                         inputs={'X': [param]},
                         outputs={'Out': [param]},
@@ -163,13 +175,13 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                             'scale': 1.0 / self.role_maker.worker_num(),
                             OP_ROLE_KEY: OpRole.Optimize
                         })
-                    main_block.append_op(
+                    sub_block.append_op(
                         type='elementwise_sub',
                         inputs={'X': [snapshot],
                                 'Y': [param]},
                         outputs={'Out': [param]},
                         attrs={OP_ROLE_KEY: OpRole.Optimize})
-                    main_block.append_op(
+                    sub_block.append_op(
                         type='assign',
                         inputs={'X': [param]},
                         outputs={'Out': [snapshot]},
diff --git a/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
similarity index 58%
rename from python/paddle/fleet/meta_optimizers/meta_optimizer_base.py
rename to python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 9ba184fb0089589a86d6444d12cf402b9687b041..12a4d904340337bf9a99968c7d82db117bf59ce8 100644
--- a/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -14,10 +14,16 @@
 
 __all__ = ["MetaOptimizerBase"]
 
+from paddle.fluid.optimizer import Optimizer
 
-class MetaOptimizerBase(object):
+
+class MetaOptimizerBase(Optimizer):
     def __init__(self, optimizer):
-        pass
+        self.inner_opt = optimizer
+        self._learning_rate = self.inner_opt._learning_rate
+        self._learning_rate_map = self.inner_opt._learning_rate_map
+        self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
@@ -26,7 +32,7 @@ class MetaOptimizerBase(object):
         self.user_defined_optimizer = user_defined_optimizer
         self.user_defined_strategy = user_defined_strategy
 
-    def _update_inner_optimier(self, optimizer):
+    def _update_inner_optimizer(self, optimizer):
         self.inner_opt = optimizer
 
     def _can_apply(self):
@@ -38,17 +44,43 @@ class MetaOptimizerBase(object):
     def _can_update(self, optimizer):
         if str(optimizer.__class__.__name__) in self.meta_optimizers_white_list:
             return True
+        return False
 
     def _disable_strategy(self, dist_strategy):
         raise NotImplementedError("you should implement disable strategy in {}".
                                   format(type(self).__name__))
 
+    def apply_gradients(self, params_grads):
+        return self.inner_opt.apply_gradients(params_grads=params_grads)
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        return self.inner_opt.backward(loss, startup_program, parameter_list,
+                                       no_grad_set, callbacks)
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.inner_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        raise NotImplementedError("meta optimizer not implemented")
+        params_grads = self.backward(
+            loss,
+            startup_program=startup_program,
+            parameter_list=parameter_list,
+            no_grad_set=no_grad_set)
+
+        optimize_ops = self.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
+        return optimize_ops, params_grads
 
     def minimize(self,
                  loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe9221307cbacfa1beaf030b70a4e4b9223769cc
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -0,0 +1,225 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid import core, unique_name
+from ..base.private_helper_function import wait_server_ready
+from paddle.fluid.optimizer import PipelineOptimizer as PO
+from .meta_optimizer_base import MetaOptimizerBase
+from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
+
+__all__ = ["PipelineOptimizer"]
+
+
+class PipelineHelper(CollectiveHelper):
+    def __init__(self, role_maker, nrings=1, wait_port='6174'):
+        super(PipelineHelper, self).__init__(role_maker, nrings, wait_port)
+
+    def _init_communicator(self, program, current_endpoint, endpoints, rank,
+                           ring_id, wait_port):
+        nranks = len(endpoints)
+        other_endpoints = endpoints[:]
+        other_endpoints.remove(current_endpoint)
+        if rank == 0 and wait_port:
+            wait_server_ready(other_endpoints)
+
+        block = program.global_block()
+        nccl_id_var = block.create_var(
+            name=unique_name.generate('nccl_id'),
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+        block.append_op(
+            type='c_gen_nccl_id',
+            inputs={},
+            outputs={'Out': nccl_id_var},
+            attrs={
+                'rank': rank,
+                'endpoint': current_endpoint,
+                'other_endpoints': other_endpoints,
+                OP_ROLE_KEY: OpRole.Forward
+            })
+
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': nccl_id_var},
+            outputs={},
+            attrs={
+                'nranks': nranks,
+                'rank': rank,
+                'ring_id': ring_id,
+                OP_ROLE_KEY: OpRole.Forward,
+                'device_id': OpRole.Forward
+            })
+
+    def _broadcast_params(self):
+        block = self.startup_program.global_block()
+        ring_id = 0
+        for param in block.iter_parameters():
+            if param.is_distributed:
+                continue
+
+            block.append_op(
+                type='c_broadcast',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={
+                    'ring_id': ring_id,
+                    'root': 0,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
+        for ring_id in range(self.nrings):
+            block.append_op(
+                type='c_sync_comm_stream',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={'ring_id': ring_id,
+                       OP_ROLE_KEY: OpRole.Forward})
+
+
+class PipelineOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(PipelineOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = []
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(PipelineOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
+        self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
+
+    def _can_apply(self):
+        if self.user_defined_strategy.pipeline == True:
+            return True
+        return False
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.pipeline = False
+        dist_strategy.pipeline_configs = {}
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        optimize_ops, params_grads, prog_list = \
+            self.wrapped_opt.minimize(loss, startup_program,
+                                      parameter_list, no_grad_set)
+        if self.role_maker.worker_num() == 1:
+            return optimize_ops, params_grads
+
+        endpoints = self.role_maker.get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker.worker_index()]
+        self.startup_program = startup_program
+        if startup_program is None:
+            self.startup_program = fluid.default_startup_program()
+
+        assert prog_list
+        self.main_program_list = prog_list
+        self.main_program = loss.block.program
+        nranks = len(endpoints)
+        self.nranks = nranks
+        self.nrings = len(self.main_program_list)
+
+        self.rank = self.role_maker.worker_index()
+        self.endpoints = endpoints
+        self.current_endpoint = current_endpoint
+
+        pipeline_helper = PipelineHelper(self.role_maker, nrings=self.nrings)
+        pipeline_helper.update_startup_program(self.startup_program)
+
+        self._transpile_main_program()
+        return optimize_ops, params_grads
+
+    def _transpile_main_program(self):
+        self._insert_loss_grad_ops()
+        for ring_id in range(self.nrings):
+            self._insert_allreduce_ops(ring_id)
+
+    def _insert_loss_grad_ops(self):
+        """
+        In order to keep the learning rate consistent in different numbers of
+        training workers, we scale the loss grad by the number of workers
+        """
+        block = self.main_program_list[self.nrings - 1]['program'].global_block(
+        )
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if is_loss_grad_op(op):
+                loss_grad_var = block.vars[op.output_arg_names[0]]
+                block._insert_op(
+                    idx + 1,
+                    type='scale',
+                    inputs={'X': loss_grad_var},
+                    outputs={'Out': loss_grad_var},
+                    attrs={
+                        'scale': 1.0 / self.nranks,
+                        OP_ROLE_KEY: OpRole.Backward
+                    })
+
+    def _insert_allreduce_ops(self, ring_id):
+        block = self.main_program_list[ring_id]['program'].global_block()
+        origin_block = self.main_program.global_block()
+        grad = None
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if is_backward_op(op) and \
+                OP_ROLE_VAR_KEY in op.attr_names:
+                op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
+                if len(op_role_var) == 0:
+                    continue
+                assert len(op_role_var) % 2 == 0
+                offset = idx
+                for i in range(0, len(op_role_var), 2):
+                    param = block.vars[op_role_var[i]]
+                    grad = block.vars[op_role_var[i + 1]]
+                    origin_param = origin_block.vars[op_role_var[i]]
+                    if origin_param.is_distributed:
+                        continue
+                    if offset == idx:
+                        offset += 1
+                        block._insert_op(
+                            offset,
+                            type='c_sync_calc_stream',
+                            inputs={'X': grad},
+                            outputs={'Out': grad},
+                            attrs={OP_ROLE_KEY: OpRole.Backward})
+                        offset += 1
+
+                    block._insert_op(
+                        offset,
+                        type='c_sync_calc_stream',
+                        inputs={'X': grad},
+                        outputs={'Out': grad},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Backward
+                        })
+
+        if grad is None:
+            return
+
+        for idx, op in enumerate(block.ops):
+            if is_optimizer_op(op):
+                block._insert_op(
+                    idx + ring_id,
+                    type='c_sync_comm_stream',
+                    inputs={'X': grad},
+                    outputs={'Out': grad},
+                    attrs={'ring_id': ring_id,
+                           OP_ROLE_KEY: OpRole.Backward})
+            break
diff --git a/python/paddle/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
similarity index 85%
rename from python/paddle/fleet/meta_optimizers/recompute_optimizer.py
rename to python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 73119d81094ac611c0d3545b59342b5dbd8b5d16..45130b447125f6ecbade2e4e5e3dad2f127fda52 100644
--- a/python/paddle/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -24,13 +24,20 @@ class RecomputeOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.wrapped_opt = RO(optimizer)
         # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer",
+            "LambOptimizer",
+            "GradientMergeOptimizer",
+            "GraphExecutionOptimizer",
+        ]
+        self.meta_optimizers_black_list = []
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
         super(RecomputeOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
-        self.wrapped_opt._set_checkpoints([])
+        self.wrapped_opt._set_checkpoints(
+            list(user_defined_strategy.recompute_configs["checkpoints"]))
 
     def _can_apply(self):
         if self.user_defined_strategy.recompute == True:
@@ -42,7 +49,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.recompute = False
-        dist_strategy.recompute_configs = {"checkpoints": []}
+        dist_strategy.recompute_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/fleet/metrics/__init__.py b/python/paddle/distributed/fleet/metrics/__init__.py
similarity index 100%
rename from python/paddle/fleet/metrics/__init__.py
rename to python/paddle/distributed/fleet/metrics/__init__.py
diff --git a/python/paddle/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py
similarity index 95%
rename from python/paddle/fleet/metrics/metric.py
rename to python/paddle/distributed/fleet/metrics/metric.py
index 152ee21c147b01e549257bf8821c5c656ee81d0d..12a24292e5a3ad9ea838d9451fdf72e7e846a528 100644
--- a/python/paddle/fleet/metrics/metric.py
+++ b/python/paddle/distributed/fleet/metrics/metric.py
@@ -43,7 +43,7 @@ def sum(input, scope=None):
           
           # in train.py, after train or infer
           res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("sum array: ", paddle.fleet.sum(res))
+          print("sum array: ", paddle.distributed.fleet.sum(res))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -82,7 +82,7 @@ def max(input, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("max array: ", paddle.fleet.max(res))
+          print("max array: ", paddle.distributed.fleet.max(res))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -121,7 +121,7 @@ def min(input, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("min array: ", paddle.fleet.min(res))
+          print("min array: ", paddle.distributed.fleet.min(res))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -162,7 +162,7 @@ def auc(stat_pos, stat_neg, scope=None):
           # in train.py, after train or infer
           pos = np.array(scope.find_var(stat_pos.name).get_tensor())
           neg = np.array(scope.find_var(stat_neg.name).get_tensor())
-          print("auc: ", paddle.fleet.auc(pos, neg))
+          print("auc: ", paddle.distributed.fleet.auc(pos, neg))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -240,7 +240,7 @@ def mae(abserr, total_ins_num, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(abserr.name).get_tensor())
-          print("mae: ", paddle.fleet.mae(res, total_ins_num))
+          print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -278,7 +278,7 @@ def rmse(sqrerr, total_ins_num, scope=None):
 
           # in train.py, after train or infer
           res = np.array(scope.find_var(sqrerr.name).get_tensor())
-          print("rmse: ", paddle.fleet.rmse(res, total_ins_num))
+          print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -316,7 +316,7 @@ def mse(sqrerr, total_ins_num, scope=None):
 
           # in train.py, after train or infer
           metric = np.array(scope.find_var(sqrerr.name).get_tensor())
-          print("mse: ", paddle.fleet.mse(metric, total_ins_num))
+          print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
@@ -365,7 +365,7 @@ def acc(correct, total, scope=None):
           # in train.py, after train or infer
           correct_num = np.array(scope.find_var(correct.name).get_tensor())
           total_num = np.array(scope.find_var(total.name).get_tensor())
-          print("accuracy: ", paddle.fleet.acc(correct_num, total_num))
+          print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
     """
     fleet._role_maker._barrier_worker()
     if scope is None:
diff --git a/python/paddle/distributed/fleet/runtime/__init__.py b/python/paddle/distributed/fleet/runtime/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a796a73fc981b7edbcd57e8f5858456031e7ae6e
--- /dev/null
+++ b/python/paddle/distributed/fleet/runtime/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .collective_runtime import CollectiveRuntime
+from .parameter_server_runtime import ParameterServerRuntime
+
+__all__ = ["CollectiveRuntime," "ParameterServerRuntime", ]
diff --git a/python/paddle/fleet/runtime/collective_runtime.py b/python/paddle/distributed/fleet/runtime/collective_runtime.py
similarity index 97%
rename from python/paddle/fleet/runtime/collective_runtime.py
rename to python/paddle/distributed/fleet/runtime/collective_runtime.py
index 0881c4b52c822908cedc94d3f4de088eed6c65e8..c56cf4c7aa2ed86f4529b1bb09d51ce64d86cfc8 100644
--- a/python/paddle/fleet/runtime/collective_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/collective_runtime.py
@@ -30,7 +30,7 @@ class CollectiveRuntime(RuntimeBase):
             "You should not call 'run_worker' method for collective mode.")
         pass
 
-    def _init_server(self):
+    def _init_server(self, *args, **kwargs):
         logging.warn(
             "You should not call 'init_server' method for collective mode.")
         pass
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..c731ed08893348d0be604eb383905cd4a9d6e228
--- /dev/null
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -0,0 +1,555 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import warnings
+
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.framework import Program
+from paddle.fluid.compiler import CompiledProgram
+from paddle.fluid.executor import Executor
+from paddle.fluid.parallel_executor import ParallelExecutor
+
+from .runtime_base import RuntimeBase
+
+
+class ParameterServerRuntime(RuntimeBase):
+    def __init__(self):
+        super(ParameterServerRuntime, self).__init__()
+        self._communicator = None
+
+    def _set_basic_info(self, context):
+        self.context = context
+        self.role_maker = context["role_maker"]
+        self.origin_main_program = context["origin_main_program"]
+        self.origin_startup_program = context["origin_startup_program"]
+        self.async_strategy = self._get_distributed_strategy()
+        self.compiled_strategy = self.build_compiled_startegy()
+
+    def _get_distributed_strategy(self):
+        strategy = None
+
+        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+
+        dist_strategy = self.context["valid_strategy"]
+        k_steps = dist_strategy.a_sync_configs["k_steps"]
+
+        if not dist_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_sync_strategy()
+
+        if dist_strategy.a_sync and k_steps == 0:
+            strategy = StrategyFactory.create_async_strategy()
+
+        if dist_strategy.a_sync and k_steps > 0:
+            strategy = StrategyFactory.create_geo_strategy(k_steps)
+
+        if not strategy:
+            raise ValueError("k_steps must be invalid value, please check")
+
+        return strategy
+
+    def build_compiled_startegy(self):
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy
+
+        compiled_config = CompileTimeStrategy(
+            self.origin_main_program, self.origin_main_program,
+            self.async_strategy, self.role_maker)
+        return compiled_config
+
+    def _load_sparse_params(self, dirname, varnames):
+        from paddle.fluid.communicator import LargeScaleKV
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts
+
+        scale_kv = LargeScaleKV()
+        for varname in varnames:
+            origin_varname, _, _ = _get_varname_parts(varname)
+            sparse_dir = os.path.join(dirname, origin_varname, varname)
+            scale_kv.load(varname, sparse_dir)
+
+    @staticmethod
+    def __exclude_vars(exclude_var_names=[]):
+        def is_valid(var):
+            if var.name in exclude_var_names:
+                return False
+
+            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts
+
+            origin_varname, _, _ = _get_varname_parts(var.name)
+            if origin_varname.endswith("@GRAD"):
+                return False
+
+            if origin_varname == "learning_rate_0":
+                return False
+
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                            var.desc.type() == core.VarDesc.VarType.READER:
+                return False
+            return var.persistable
+
+        return is_valid
+
+    def _init_worker(self):
+        def sync_strategy_envs():
+            kwargs = {}
+            kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints(
+            )
+            kwargs["trainer_id"] = self.role_maker.worker_index()
+            return kwargs
+
+        def geo_strategy_envs():
+            from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
+
+            def get_sparse_attrs():
+                opt_init_map = {}
+                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
+                opt_init_map["fill_constant"] = ["value"]
+                opt_init_map["uniform_random"] = ["seed", "min", "max"]
+                opt_init_map[
+                    "truncated_gaussian_random"] = ["seed", "mean", "std"]
+
+                dist_varnames = get_sparse_tablenames(self.origin_main_program,
+                                                      True)
+                sparse_varnames = get_sparse_tablenames(
+                    self.origin_main_program, False)
+
+                if len(dist_varnames) != 0:
+                    raise ValueError(
+                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
+                    )
+
+                init_attrs = []
+                for value_name in sparse_varnames:
+                    value_var = self.origin_main_program.global_block().vars[
+                        value_name]
+                    value_attr = [
+                        value_name,
+                        ",".join([str(dim) for dim in value_var.shape])
+                    ]
+                    for op in self.origin_startup_program.global_block().ops:
+                        if op.type in opt_init_map.keys(
+                        ) and value_name == op.output("Out")[0]:
+                            init_attr = [op.type]
+                            for attr in opt_init_map[op.type]:
+                                init_attr.append(str(op.attr(attr)))
+                            value_attr.append("&".join(init_attr))
+                            init_attrs.append(":".join(value_attr))
+                            break
+                return "#".join(init_attrs)
+
+            kwargs = {}
+            kwargs["trainers"] = self.role_maker.worker_num()
+            kwargs["sparse_attrs"] = get_sparse_attrs()
+            return kwargs
+
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops
+
+        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
+            SyncStrategy, GeoStrategy
+
+        trainer_config = self.async_strategy.get_trainer_runtime_config()
+        lrs = _get_lr_ops(self.origin_main_program)
+
+        if len(lrs) > 0:
+            kwargs = {"need_global_step": "1"}
+        else:
+            kwargs = {"need_global_step": "0"}
+
+        if isinstance(self.async_strategy, GeoStrategy):
+            geo_kwargs = geo_strategy_envs()
+            kwargs.update(geo_kwargs)
+        if isinstance(self.async_strategy, SyncStrategy):
+            sync_kwargs = sync_strategy_envs()
+            kwargs.update(sync_kwargs)
+
+        kwargs = kwargs if kwargs else None
+
+        send_ctx = self.compiled_strategy.get_communicator_send_context()
+
+        if self.compiled_strategy.is_geo_mode():
+            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
+                recv_type=4)
+        else:
+            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
+                recv_type=1)
+
+        from paddle.fluid.communicator import Communicator
+        self._communicator = Communicator(
+            trainer_config.mode, kwargs,
+            trainer_config.get_communicator_flags())
+        self._communicator.init_with_ctx(send_ctx, recv_ctx)
+
+        if not self._communicator.is_running():
+            self._communicator.start()
+        else:
+            warnings.warn("communicator has been initialized, skip")
+
+    def _init_server(self, *args, **kwargs):
+        if len(args) > 1:
+            raise ValueError("init server can only accept 1 args: `dirname`")
+        elif len(args) == 1:
+            model_dirname = args[0]
+        else:
+            model_dirname = None
+
+        executor = fluid.Executor(fluid.CPUPlace())
+        executor.run(fluid.default_startup_program())
+
+        if not model_dirname:
+            return
+
+        if not os.path.isdir(model_dirname):
+            raise ValueError("There is no directory named '%s'", model_dirname)
+
+        sparse_varnames = self.compiled_strategy.get_sparse_varname_on_ps(True)
+
+        distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
+            False)
+
+        remaining_vars = list(
+            filter(
+                ParameterServerRuntime.__exclude_vars(sparse_varnames +
+                                                      distribtued_varnames),
+                fluid.default_main_program().list_vars()))
+
+        fluid.io.load_vars(
+            executor,
+            main_program=fluid.default_main_program(),
+            dirname=model_dirname,
+            vars=remaining_vars)
+
+        self._load_sparse_params(
+            dirname=model_dirname, varnames=sparse_varnames)
+
+        # todo(tangwei12) load distributed vars
+        # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
+
+    def _run_server(self):
+        executor = fluid.Executor(fluid.CPUPlace())
+        executor.run(fluid.default_main_program())
+
+    def _stop_worker(self):
+        self._communicator.stop()
+        executor = fluid.Executor(fluid.CPUPlace())
+        executor.close()
+
+    def _get_optimizer_status(self, op, param_name):
+        supported_opts = [
+            "sgd", "adam", "adagrad", "adamax", "momentum", "lars_momentum",
+            "rmsprop", "decayed_adagrad", "ftrl"
+        ]
+
+        reshaped_val_map = {}
+        reshaped_val_map["sgd"] = []
+        reshaped_val_map["adam"] = ["moment1_0", "moment2_0"]
+        reshaped_val_map["adagrad"] = ["moment_0"]
+        reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"]
+        reshaped_val_map["momentum"] = ["velocity_0"]
+        reshaped_val_map["lars_momentum"] = ["velocity_0"]
+        reshaped_val_map[
+            "rmsprop"] = ["momentum_0", "mean_square_0", "mean_grad_0"]
+        reshaped_val_map["decayed_adagrad"] = ["moment_0"]
+        reshaped_val_map["ftrl"] = ["squared_0", "linear_0"]
+
+        orishaped_val_map = {}
+        orishaped_val_map["adam"] = ["beta1_pow_acc_0", "beta2_pow_acc_0"]
+        orishaped_val_map["adamax"] = ["beta1_pow_acc_0"]
+
+        if op not in supported_opts:
+            raise ValueError(
+                "fleet can not support optimizer: {}, only this can be supported: {}".
+                format(op, supported_opts))
+
+        reshaped_names = [
+            param_name + "_" + val for val in reshaped_val_map[op]
+        ]
+
+        if op not in orishaped_val_map:
+            origin_names = []
+        else:
+            origin_names = [
+                param_name + "_" + val for val in orishaped_val_map[op]
+            ]
+        return reshaped_names, origin_names
+
+    def _get_optimizer_op(self, param_name):
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
+
+        opts = _get_optimize_ops(self.origin_main_program)
+        for op in opts:
+            if "Param" in op.input_names and \
+                            "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
+                return op
+
+    def _save_dense_params(self, executor, dirname, context, main_program):
+        self._communicator.recv()
+
+        prog = Program()
+        block = prog.global_block()
+        local_vars = []
+
+        for name, var_ctx in context.items():
+            if len(var_ctx.origin_varnames()) != 1:
+                raise ValueError("Dense can not support split now.")
+
+            varname = var_ctx.origin_varnames()[0]
+            local_vars.append(varname)
+
+            optimizer = self._get_optimizer_op(varname)
+            reshaped_varnames, origin_varnames = self._get_optimizer_status(
+                optimizer.type, varname)
+
+            for var_name in [varname] + reshaped_varnames + origin_varnames:
+                var = self.origin_main_program.global_block().vars[var_name]
+                block.append_op(
+                    type='recv_save',
+                    attrs={
+                        "trainer_id": self.role_maker.worker_index(),
+                        "shape": var.shape,
+                        "slice_shapes":
+                        [",".join([str(i) for i in var.shape])],
+                        "slice_varnames": [var.name],
+                        "remote_varnames": [var.name],
+                        "is_sparse": False,
+                        "endpoints": var_ctx.split_endpoints(),
+                        "file_path": os.path.join(dirname, var.name)
+                    })
+
+        executor.run(prog)
+        return local_vars
+
+    def _save_sparse_params(self, executor, dirname, context, main_program):
+        prog = Program()
+        block = prog.global_block()
+        local_vars = []
+
+        for name, var_ctx in context.items():
+            if len(var_ctx.origin_varnames()) != 1:
+                raise ValueError("Dense can not support split now.")
+
+            varname = var_ctx.origin_varnames()[0]
+            local_vars.append(varname)
+
+            optimizer = self._get_optimizer_op(varname)
+            reshaped_varnames, origin_varnames = self._get_optimizer_status(
+                optimizer.type, varname)
+
+            var = self.origin_main_program.global_block().vars[varname]
+            slice_shapes = []
+            dims1 = ",".join([str(i) for i in var.shape[1:]])
+
+            for section in var_ctx.sections():
+                slice_shapes.append(str(section) + dims1)
+
+            block.append_op(
+                type='recv_save',
+                attrs={
+                    "trainer_id": self.role_maker.worker_index(),
+                    "shape": var.shape,
+                    "slice_shapes": slice_shapes,
+                    "slice_varnames": var_ctx.split_varnames(),
+                    "remote_varnames": var_ctx.split_varnames(),
+                    "is_sparse": True,
+                    "endpoints": var_ctx.split_endpoints(),
+                    "pserver_num": len(self.role_maker.get_pserver_endpoints()),
+                    "file_path": os.path.join(dirname, var.name)
+                })
+
+            for reshaped_varname in reshaped_varnames:
+                var = self.origin_main_program.global_block().vars[
+                    reshaped_varname]
+
+                slice_varnames = []
+                remote_varnames = []
+                for i in range(len(var_ctx.split_varnames())):
+                    slice_varnames.append("{}.block{}".format(reshaped_varname,
+                                                              i))
+                    remote_varnames.append(reshaped_varname)
+
+                block.append_op(
+                    type='recv_save',
+                    attrs={
+                        "trainer_id": self.role_maker.worker_index(),
+                        "shape": var.shape,
+                        "slice_shapes": slice_shapes,
+                        "slice_varnames": slice_varnames,
+                        "remote_varnames": remote_varnames,
+                        "is_sparse": True,
+                        "endpoints": var_ctx.split_endpoints(),
+                        "pserver_num":
+                        len(self.role_maker.get_pserver_endpoints()),
+                        "file_path": os.path.join(dirname, var.name)
+                    })
+
+            for origin_varname in origin_varnames:
+                var = self.origin_main_program.global_block().vars[
+                    origin_varname]
+
+                block.append_op(
+                    type='recv_save',
+                    attrs={
+                        "trainer_id": self.role_maker.worker_index(),
+                        "shape": var.shape,
+                        "slice_shapes":
+                        [",".join([str(i) for i in var.shape])],
+                        "slice_varnames": [origin_varname],
+                        "remote_varnames": [origin_varname],
+                        "is_sparse": False,
+                        "endpoints": var_ctx.split_endpoints()[:1],
+                        "file_path": os.path.join(dirname, var.name)
+                    })
+        executor.run(prog)
+        return context.keys()
+
+    def _save_distributed_params(self, executor, dirname, context,
+                                 main_program):
+        prog = Program()
+        block = prog.global_block()
+
+        for name, var_ctx in context.items():
+            block.append_op(
+                type='checkpoint_notify',
+                attrs={
+                    "varname": name,
+                    "is_slice": True,
+                    "slice_varnames": var_ctx.split_varnames(),
+                    "remote_varnames": var_ctx.split_varnames(),
+                    "endpoints": var_ctx.split_endpoints(),
+                    "dirname": dirname
+                })
+
+        executor.run(prog)
+        return context.keys()
+
+    def _save_distributed_persistables(self, executor, dirname, main_program):
+        dense_ctx = self.compiled_strategy.get_communicator_recv_context(
+            recv_type=1)
+
+        sparse_ctx = self.compiled_strategy.get_communicator_recv_context(
+            recv_type=2)
+
+        distributed_ctx = self.compiled_strategy.get_communicator_recv_context(
+            recv_type=3)
+
+        recv_dense_varnames = self._save_dense_params(executor, dirname,
+                                                      dense_ctx, main_program)
+
+        recv_sparse_varnames = self._save_sparse_params(
+            executor, dirname, sparse_ctx, main_program)
+
+        recv_distributed_varnames = self._save_distributed_params(
+            executor, dirname, distributed_ctx, main_program)
+
+        saved_varnames = recv_dense_varnames + list(
+            recv_sparse_varnames) + list(recv_distributed_varnames)
+
+        remaining_vars = list(
+            filter(
+                ParameterServerRuntime.__exclude_vars(saved_varnames),
+                main_program.list_vars()))
+
+        fluid.io.save_vars(
+            executor,
+            main_program=main_program,
+            dirname=dirname,
+            vars=remaining_vars)
+
+    def _ps_inference_save_persistables(self,
+                                        executor,
+                                        dirname,
+                                        main_program=None,
+                                        **kwargs):
+        """
+        This function filters out all variables with `persistable==True` from the
+        give `main_program` and then saves these variables to the folder `dirname`
+        or file `filename`.
+
+        The `dirname` is used to specify the folder where persistable variables
+        are going to be saved. If you would like to save variables in separate
+        files, set `filename` None; if you would like to save all variables in a
+        single file, use `filename` to specify the file name.
+        """
+
+        if isinstance(executor, ParallelExecutor):
+            raise TypeError(
+                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
+            )
+
+        if not isinstance(executor, Executor):
+            raise TypeError(
+                "in fleet.save_persistables() function, executor must be as Executor type"
+            )
+
+        if main_program is None:
+            main_program = fluid.default_main_program()
+
+        if isinstance(main_program, CompiledProgram):
+            raise TypeError(
+                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
+            )
+
+        self._save_distributed_persistables(executor, dirname, main_program)
+
+    def _ps_inference_save_inference_model(self,
+                                           executor,
+                                           dirname,
+                                           feeded_var_names,
+                                           target_vars,
+                                           main_program=None,
+                                           export_for_deployment=True):
+        """
+        Prune the given `main_program` to build a new program especially for inference,
+        and then save it and all related parameters to given `dirname` by the `executor`.
+        """
+
+        if isinstance(executor, ParallelExecutor):
+            raise TypeError(
+                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
+            )
+
+        if not isinstance(executor, Executor):
+            raise TypeError(
+                "in fleet.save_inference_model() function, executor must be as Executor type"
+            )
+
+        if main_program is not None:
+            if isinstance(main_program, CompiledProgram):
+                raise TypeError(
+                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
+                )
+            fluid.io.save_inference_model(dirname, feeded_var_names,
+                                          target_vars, executor, main_program,
+                                          None, None, export_for_deployment)
+        else:
+            fluid.io.save_inference_model(dirname, feeded_var_names,
+                                          target_vars, executor,
+                                          self.origin_main_program, None, None,
+                                          export_for_deployment, True)
+
+            model_basename = "__model__"
+            model_filename = os.path.join(dirname, model_basename)
+
+            with open(model_filename, "rb") as f:
+                program_desc_str = f.read()
+
+            program = Program.parse_from_string(program_desc_str)
+            program._copy_dist_param_info_from(fluid.default_main_program())
+            self._ps_inference_save_persistables(executor, dirname, program)
+
+    def _save_inference_model(self, *args, **kwargs):
+        self._ps_inference_save_inference_model(*args, **kwargs)
+
+    def _save_persistables(self, *args, **kwargs):
+        self._ps_inference_save_persistables(*args, **kwargs)
diff --git a/python/paddle/fleet/runtime/runtime_base.py b/python/paddle/distributed/fleet/runtime/runtime_base.py
similarity index 83%
rename from python/paddle/fleet/runtime/runtime_base.py
rename to python/paddle/distributed/fleet/runtime/runtime_base.py
index c7ce8b5a2914bf30f346cbd0777d1d233ddf5e1b..2e8bacfbc3b1ded58e63e8d9e93764a0c0090b91 100644
--- a/python/paddle/fleet/runtime/runtime_base.py
+++ b/python/paddle/distributed/fleet/runtime/runtime_base.py
@@ -25,7 +25,7 @@ class RuntimeBase(object):
     def _run_worker(self):
         pass
 
-    def _init_server(self):
+    def _init_server(self, *args, **kwargs):
         pass
 
     def _run_server(self):
@@ -33,3 +33,9 @@ class RuntimeBase(object):
 
     def _stop_worker(self):
         pass
+
+    def _save_inference_model(self, *args, **kwargs):
+        pass
+
+    def _save_persistables(self, *args, **kwargs):
+        pass
diff --git a/python/paddle/nn/input.py b/python/paddle/distributed/fleet/utils/__init__.py
similarity index 72%
rename from python/paddle/nn/input.py
rename to python/paddle/distributed/fleet/utils/__init__.py
index b5f591f44a9a167dcba8e4e46322ca157a5e48cb..212308159aabb123fde11543b3482f2232b4925d 100644
--- a/python/paddle/nn/input.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define input placeholders of neural network  
-from ..fluid import data  #DEFINE_ALIAS
+from .fs import *
+from .http_server import KVHandler, KVHTTPServer, KVServer
 
-__all__ = [
-    'data',
-    #       'Input'
-]
+__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dbe5cefbb4944e219989358ebeb0c321f942551
--- /dev/null
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -0,0 +1,515 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import subprocess
+import multiprocessing
+from datetime import datetime
+
+import re
+import copy
+import errno
+import time
+import logging
+import six
+import abc
+import paddle.fluid as fluid
+from paddle.fluid import core
+import functools
+
+from pathlib import PurePosixPath, Path
+import shutil
+
+__all__ = [
+    'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
+    'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted'
+]
+
+
+class ExecuteError(Exception):
+    pass
+
+
+class FSFileExistsError(Exception):
+    pass
+
+
+class FSFileNotExistsError(Exception):
+    pass
+
+
+class FSTimeOut(Exception):
+    pass
+
+
+class FSShellCmdAborted(ExecuteError):
+    pass
+
+
+class FS(object):
+    @abc.abstractmethod
+    def ls_dir(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_file(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_dir(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_exist(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def upload(self, local_path, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def download(self, fs_path, local_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def mkdirs(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def delete(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def need_upload_download(self):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def rename(self, fs_src_path, fs_dst_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def upload_dir(self, local_dir, dest_dir):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def list_dirs(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def touch(self, fs_path, exist_ok=True):
+        raise NotImplementedError
+
+
+class LocalFS(FS):
+    def ls_dir(self, fs_path):
+        if not self.is_exist(fs_path):
+            return [], []
+
+        dirs = []
+        files = []
+        for f in os.listdir(fs_path):
+            if os.path.isdir(fs_path + "/" + f):
+                dirs.append(f)
+            else:
+                files.append(f)
+
+        return dirs, files
+
+    def mkdirs(self, fs_path):
+        assert not os.path.isfile(fs_path), "{} is already a file".format(
+            fs_path)
+        os.system("mkdir -p {}".format(fs_path))
+
+    def rename(self, fs_src_path, fs_dst_path):
+        os.rename(fs_src_path, fs_dst_path)
+
+    def _rmr(self, fs_path):
+        shutil.rmtree(fs_path)
+
+    def _rm(self, fs_path):
+        os.remove(fs_path)
+
+    def delete(self, fs_path):
+        if not self.is_exist(fs_path):
+            return
+
+        if os.path.isfile(fs_path):
+            return self._rm(fs_path)
+
+        return self._rmr(fs_path)
+
+    def need_upload_download(self):
+        return False
+
+    def is_file(self, fs_path):
+        return os.path.isfile(fs_path)
+
+    def is_dir(self, fs_path):
+        return os.path.isdir(fs_path)
+
+    def is_exist(self, fs_path):
+        return os.path.exists(fs_path)
+
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return Path(fs_path).touch(exist_ok=True)
+
+    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
+        if not self.is_exist(src_path):
+            raise FSFileNotExistsError
+
+        if overwrite and self.is_exist(dst_path):
+            self.delete(dst_path)
+
+        if self.is_exist(dst_path):
+            raise FSFileExistsError
+
+        return self.rename(src_path, dst_path)
+
+    def list_dirs(self, fs_path):
+        """	
+        list directory under fs_path, and only give the pure name, not include the fs_path	
+        """
+        if not self.is_exist(fs_path):
+            return []
+
+        dirs = [
+            f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
+        ]
+
+        return dirs
+
+
+"""HDFS Utils."""
+
+
+def _handle_errors(f):
+    def handler(*args, **kwargs):
+        start = time.time()
+        while True:
+            try:
+                return f(*args, **kwargs)
+            except ExecuteError as e:
+                o = args[0]
+                time_out = float(o._time_out) / 1000.0
+                inter = float(o._sleep_inter) / 1000.0
+                if time.time() - start >= time_out:
+                    raise FSTimeOut
+                time.sleep(inter)
+
+    return functools.wraps(f)(handler)
+
+
+def _handle_errors(max_time_out=None):
+    def decorator(f):
+        @functools.wraps(f)
+        def handler(*args, **kwargs):
+            o = args[0]
+            time_out = max_time_out
+            if time_out is None:
+                time_out = float(o._time_out) / 1000.0
+            else:
+                time_out /= 1000.0
+            inter = float(o._sleep_inter) / 1000.0
+
+            start = time.time()
+            last_print_time = start
+            while True:
+                try:
+                    return f(*args, **kwargs)
+                #important: only ExecuteError need to retry
+                except ExecuteError as e:
+                    if time.time() - start >= time_out:
+                        raise FSTimeOut("args:{} timeout:{}".format(
+                            args, time.time() - start))
+
+                    time.sleep(inter)
+
+                if time.time() - last_print_time > 30:
+                    print("hadoop operator timeout:args:{} timeout:{}".format(
+                        args, time.time() - start))
+                    last_print_time = time.time()
+
+        return handler
+
+    return decorator
+
+
+class HDFSClient(FS):
+    def __init__(
+            self,
+            hadoop_home,
+            configs,
+            time_out=5 * 60 * 1000,  #ms
+            sleep_inter=1000):  #ms
+        # Raise exception if JAVA_HOME not exists.
+        java_home = os.environ["JAVA_HOME"]
+
+        self.pre_commands = []
+        hadoop_bin = '%s/bin/hadoop' % hadoop_home
+        self.pre_commands.append(hadoop_bin)
+        dfs = 'fs'
+        self.pre_commands.append(dfs)
+
+        if configs:
+            for k, v in six.iteritems(configs):
+                config_command = '-D%s=%s' % (k, v)
+                self.pre_commands.append(config_command)
+
+        self._time_out = time_out
+        self._sleep_inter = sleep_inter
+        self._base_cmd = " ".join(self.pre_commands)
+        self._bd_err_re = re.compile(
+            r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
+
+    def _run_cmd(self, cmd, redirect_stderr=False):
+        exe_cmd = "{} -{}".format(self._base_cmd, cmd)
+        ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr)
+        ret = int(ret)
+        if ret == 134:
+            raise FSShellCmdAborted(cmd)
+        return ret, output.splitlines()
+
+    @_handle_errors()
+    def list_dirs(self, fs_path):
+        if not self.is_exist(fs_path):
+            return []
+
+        dirs, files = self._ls_dir(fs_path)
+        return dirs
+
+    @_handle_errors()
+    def ls_dir(self, fs_path):
+        """	
+        list directory under fs_path, and only give the pure name, not include the fs_path	
+        """
+        if not self.is_exist(fs_path):
+            return [], []
+
+        return self._ls_dir(fs_path)
+
+    def _ls_dir(self, fs_path):
+        cmd = "ls {}".format(fs_path)
+        ret, lines = self._run_cmd(cmd)
+
+        if ret != 0:
+            raise ExecuteError(cmd)
+
+        dirs = []
+        files = []
+        for line in lines:
+            arr = line.split()
+            if len(arr) != 8:
+                continue
+
+            p = PurePosixPath(arr[7])
+            if arr[0][0] == 'd':
+                dirs.append(p.name)
+            else:
+                files.append(p.name)
+
+        return dirs, files
+
+    def _test_match(self, lines):
+        for l in lines:
+            m = self._bd_err_re.match(l)
+            if m != None:
+                return m
+
+        return None
+
+    @_handle_errors()
+    def is_dir(self, fs_path):
+        if not self.is_exist(fs_path):
+            return False
+
+        return self._is_dir(fs_path)
+
+    def _is_dir(self, fs_path):
+        cmd = "test -d {}".format(fs_path, redirect_stderr=True)
+        ret, lines = self._run_cmd(cmd)
+        if ret:
+            # other error
+            if self._test_match(lines):
+                raise ExecuteError(cmd)
+
+            return False
+
+        return True
+
+    def is_file(self, fs_path):
+        if not self.is_exist(fs_path):
+            return False
+
+        return not self._is_dir(fs_path)
+
+    @_handle_errors()
+    def is_exist(self, fs_path):
+        cmd = "ls {} ".format(fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
+        if ret != 0:
+            for l in out:
+                if "No such file or directory" in l:
+                    return False
+            raise ExecuteError(cmd)
+
+        return True
+
+    # can't retry
+    def upload(self, local_path, fs_path):
+        if self.is_exist(fs_path):
+            raise FSFileExistsError("{} exists".format(fs_path))
+
+        local = LocalFS()
+        if not local.is_exist(local_path):
+            raise FSFileNotExistsError("{} not exists".format(local_path))
+
+        return self._try_upload(local_path, fs_path)
+
+    @_handle_errors()
+    def _try_upload(self, local_path, fs_path):
+        cmd = "put {} {}".format(local_path, fs_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            self.delete(fs_path)
+            raise e
+
+    # can't retry
+    def download(self, fs_path, local_path):
+        if self.is_exist(local_path):
+            raise FSFileExistsError("{} exists".format(local_path))
+
+        if not self.is_exist(fs_path):
+            raise FSFileNotExistsError("{} not exits".format(fs_path))
+
+        return self._try_download(fs_path, local_path)
+
+    @_handle_errors()
+    def _try_download(self, fs_path, local_path):
+        cmd = "get {} {}".format(fs_path, local_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            local_fs = LocalFS()
+            local_fs.delete(local_path)
+            raise e
+
+    @_handle_errors()
+    def mkdirs(self, fs_path):
+        if self.is_exist(fs_path):
+            return
+
+        out_hdfs = False
+
+        cmd = "mkdir {} ".format(fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
+        if ret != 0:
+            for l in out:
+                if "No such file or directory" in l:
+                    out_hdfs = True
+                    break
+            if not out_hdfs:
+                raise ExecuteError(cmd)
+
+        if out_hdfs and not self.is_exist(fs_path):
+            cmd = "mkdir -p {}".format(fs_path)
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        if overwrite and self.is_exist(fs_dst_path):
+            self.delete(fs_dst_path)
+
+        if test_exists:
+            if not self.is_exist(fs_src_path):
+                raise FSFileNotExistsError("{} is not exists".format(
+                    fs_src_path))
+
+            if self.is_exist(fs_dst_path):
+                raise FSFileExistsError("{} exists already".format(
+                    fs_src_path, fs_dst_path, fs_dst_path))
+
+        return self._try_mv(fs_src_path, fs_dst_path)
+
+    @_handle_errors()
+    def _try_mv(self, fs_src_path, fs_dst_path):
+        cmd = "mv {} {}".format(fs_src_path, fs_dst_path)
+        ret = 0
+        try:
+            ret, _ = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            if not self.is_exist(fs_src_path) and \
+                    self.is_exist(fs_dst_path):
+                return
+            raise e
+
+    def _rmr(self, fs_path):
+        cmd = "rmr {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError(cmd)
+
+    def _rm(self, fs_path):
+        cmd = "rm {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError(cmd)
+
+    @_handle_errors()
+    def delete(self, fs_path):
+        if not self.is_exist(fs_path):
+            return
+
+        is_dir = self._is_dir(fs_path)
+        if is_dir:
+            return self._rmr(fs_path)
+
+        return self._rm(fs_path)
+
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return self._touchz(fs_path)
+
+    @_handle_errors()
+    def _touchz(self, fs_path):
+        cmd = "touchz {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    def need_upload_download(self):
+        return True
diff --git a/python/paddle/distributed/fleet/utils/http_server.py b/python/paddle/distributed/fleet/utils/http_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e310b0a5a516aaaebe6f35822243c56e2ba905
--- /dev/null
+++ b/python/paddle/distributed/fleet/utils/http_server.py
@@ -0,0 +1,195 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Http Server."""
+
+import logging
+
+import six
+# NOTE: HTTPServer has a different name in python2 and python3
+if six.PY2:
+    from BaseHTTPServer import HTTPServer
+    import SimpleHTTPServer
+else:
+    from http.server import HTTPServer
+    import http.server as SimpleHTTPServer
+
+import time
+import threading
+import socket
+
+
+def get_logger(name, level, fmt):
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    handler = logging.FileHandler('http.log', mode='w')
+    formatter = logging.Formatter(fmt=fmt)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    return logger
+
+
+_http_server_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+    """
+    kv handler class for kv http server,
+    it defines the way to get/set kv in server.
+    """
+
+    def do_GET(self):
+        """
+        get method for kv handler, get value according to key.
+        """
+        log_str = "GET " + self.address_string() + self.path
+        paths = self.path.split('/')
+        if len(paths) < 3:
+            print('len of request path must be 3: ' + self.path)
+            self.send_status_code(400)
+            return
+        _, scope, key = paths
+        with self.server.kv_lock:
+            value = self.server.kv.get(scope, {}).get(key)
+        if value is None:
+            log_str += ' , key not found: ' + key
+            self.send_status_code(404)
+        else:
+            log_str += ' , key found: ' + key
+            self.send_response(200)
+            self.send_header("Content-Length", str(len(value)))
+            self.end_headers()
+            self.wfile.write(value)
+        _http_server_logger.info(log_str)
+
+    def do_PUT(self):
+        """
+        put method for kv handler, set value according to key.
+        """
+        log_str = "PUT " + self.address_string() + self.path
+        paths = self.path.split('/')
+        if len(paths) < 3:
+            print('len of request path must be 3: ' + self.path)
+            self.send_status_code(400)
+            return
+        _, scope, key = paths
+        content_length = int(self.headers['Content-Length'])
+        try:
+            value = self.rfile.read(content_length)
+        except:
+            print("receive error invalid request")
+            self.send_status_code(404)
+            return
+        with self.server.kv_lock:
+            if self.server.kv.get(scope) is None:
+                self.server.kv[scope] = {}
+            self.server.kv[scope][key] = value
+        self.send_status_code(200)
+        _http_server_logger.info(log_str)
+
+    def do_DELETE(self):
+        """
+        delete method for kv handler, set value according to key.
+        """
+        log_str = "DELETE " + self.address_string() + self.path
+        paths = self.path.split('/')
+        if len(paths) < 3:
+            print('len of request path must be 3: ' + self.path)
+            self.send_status_code(400)
+            return
+        _, scope, key = paths
+        with self.server.delete_kv_lock:
+            if self.server.delete_kv.get(scope) is None:
+                self.server.delete_kv[scope] = []
+            self.server.delete_kv[scope].append(key)
+        self.send_status_code(200)
+        _http_server_logger.info(log_str)
+
+    def log_message(self, format, *args):
+        """
+        ignore all logging messages in kv handler.
+        """
+        pass
+
+    def send_status_code(self, code):
+        """
+        send status code back to client.
+        """
+        self.send_response(code)
+        self.send_header("Content-Length", 0)
+        self.end_headers()
+
+
+class KVHTTPServer(HTTPServer, object):
+    """
+    it is a http server storing kv pairs.
+    """
+
+    def __init__(self, port, handler):
+        """Init."""
+        super(KVHTTPServer, self).__init__(('', port), handler)
+        self.delete_kv_lock = threading.Lock()
+        self.delete_kv = {}
+        self.kv_lock = threading.Lock()
+        self.kv = {}
+
+    def get_deleted_size(self, key):
+        """
+        get deleted size in key.
+        """
+        ret = 0
+        with self.delete_kv_lock:
+            ret = self.delete_kv.get(key, 0)
+        return ret
+
+
+class KVServer:
+    """
+    it is a server storing kv pairs, has a http server inside.
+    """
+
+    def __init__(self, port, size={}):
+        """Init."""
+        self.http_server = KVHTTPServer(port, KVHandler)
+        self.listen_thread = None
+        self.size = {}
+
+    def start(self):
+        """
+        start server until user calls stop to let it quit.
+        """
+        self.listen_thread = threading.Thread(
+            target=lambda: self.http_server.serve_forever())
+        self.listen_thread.start()
+
+    def stop(self):
+        """
+        stop server and clear its resources.
+        """
+        self.http_server.shutdown()
+        self.listen_thread.join()
+        self.http_server.server_close()
+
+    def shoud_stop(self):
+        """
+        return whether the server should stop.
+
+        Returns:
+            ret(bool): whether the server should stop
+        """
+        for key in self.size:
+            s = self.http_server.get_deleted_size(key)
+            if s != self.size.get(key, 0):
+                return False
+        return True
diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py
index fff10c5b2a9ee497cccff94346314db2c8011eb5..acb7251a15dbfff7e032079c2be2e973e81aa5f4 100644
--- a/python/paddle/distribution.py
+++ b/python/paddle/distribution.py
@@ -18,3 +18,525 @@
 #            'Normal',
 #            'sampling_id',
 #            'Uniform']
+
+from __future__ import print_function
+
+from .fluid.layers import control_flow
+from .fluid.layers import tensor
+from .fluid.layers import ops
+from .fluid.layers import nn
+from .fluid import core
+from .fluid.framework import in_dygraph_mode
+from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub
+import math
+import numpy as np
+import warnings
+
+from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+
+__all__ = ['Distribution', 'Uniform', 'Normal']
+
+
+class Distribution(object):
+    """
+    The abstract base class for probability distributions. Functions are 
+    implemented in specific distributions.
+    """
+
+    def __init__(self):
+        super(Distribution, self).__init__()
+
+    def sample(self):
+        """Sampling from the distribution."""
+        raise NotImplementedError
+
+    def entropy(self):
+        """The entropy of the distribution."""
+        raise NotImplementedError
+
+    def kl_divergence(self, other):
+        """The KL-divergence between self distributions and other."""
+        raise NotImplementedError
+
+    def log_prob(self, value):
+        """Log probability density/mass function."""
+        raise NotImplementedError
+
+    def probs(self, value):
+        """Probability density/mass function."""
+        raise NotImplementedError
+
+    def _validate_args(self, *args):
+        """
+        Argument validation for distribution args
+        Args:
+            value (float, list, numpy.ndarray, Tensor)
+        Raises
+            ValueError: if one argument is Tensor, all arguments should be Tensor
+        """
+        is_variable = False
+        is_number = False
+        for arg in args:
+            if isinstance(arg, tensor.Variable):
+                is_variable = True
+            else:
+                is_number = True
+
+        if is_variable and is_number:
+            raise ValueError(
+                'if one argument is Tensor, all arguments should be Tensor')
+
+        return is_variable
+
+    def _to_tensor(self, *args):
+        """
+        Argument convert args to Tensor
+
+        Args:
+            value (float, list, numpy.ndarray, Tensor)
+        Returns:
+            Tensor of args.
+        """
+        numpy_args = []
+        variable_args = []
+        tmp = 0.
+
+        for arg in args:
+            valid_arg = False
+            for cls in [float, list, np.ndarray, tensor.Variable]:
+                if isinstance(arg, cls):
+                    valid_arg = True
+                    break
+            assert valid_arg, "type of input args must be float, list, numpy.ndarray or Tensor."
+            if isinstance(arg, float):
+                arg = np.zeros(1) + arg
+            arg_np = np.array(arg)
+            arg_dtype = arg_np.dtype
+            if str(arg_dtype) not in ['float32']:
+                warnings.warn(
+                    "data type of argument only support float32, your argument will be convert to float32."
+                )
+                arg_np = arg_np.astype('float32')
+            tmp = tmp + arg_np
+            numpy_args.append(arg_np)
+
+        dtype = tmp.dtype
+        for arg in numpy_args:
+            arg_broadcasted, _ = np.broadcast_arrays(arg, tmp)
+            arg_variable = tensor.create_tensor(dtype=dtype)
+            tensor.assign(arg_broadcasted, arg_variable)
+            variable_args.append(arg_variable)
+
+        return tuple(variable_args)
+
+
+class Uniform(Distribution):
+    """Uniform distribution with `low` and `high` parameters.
+
+    Mathematical Details
+
+    The probability density function (pdf) is
+
+    .. math::
+
+        pdf(x; a, b) = \\frac{1}{Z}, \ a <=x <b
+
+    .. math::
+
+        Z = b - a
+
+    In the above equation:
+
+    * :math:`low = a`,
+    * :math:`high = b`,
+    * :math:`Z`: is the normalizing constant.
+
+    The parameters `low` and `high` must be shaped in a way that supports
+    [broadcasting](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/beginners_guide/basic_concept/broadcasting_en.html) (e.g., `high - low` is a valid operation).
+
+    Args:
+        low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor
+        high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor
+        name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+          from paddle.distribution import Uniform
+
+          paddle.disable_static()
+          # Without broadcasting, a single uniform distribution [3, 4]:
+          u1 = Uniform(low=3.0, high=4.0)
+          # 2 distributions [1, 3], [2, 4]
+          u2 = Uniform(low=[1.0, 2.0], high=[3.0, 4.0])
+          # 4 distributions
+          u3 = Uniform(low=[[1.0, 2.0], [3.0, 4.0]],
+                    high=[[1.5, 2.5], [3.5, 4.5]])
+
+          # With broadcasting:
+          u4 = Uniform(low=3.0, high=[5.0, 6.0, 7.0])
+
+          # Complete example
+          value_npdata = np.array([0.8], dtype="float32")
+          value_tensor = paddle.to_tensor(value_npdata)
+
+          uniform = Uniform([0.], [2.])
+
+          sample = uniform.sample([2])
+          # a random tensor created by uniform distribution with shape: [2, 1]
+          entropy = uniform.entropy()
+          # [0.6931472] with shape: [1]
+          lp = uniform.log_prob(value_tensor)
+          # [-0.6931472] with shape: [1]
+          p = uniform.probs(value_tensor)
+          # [0.5] with shape: [1]
+    """
+
+    def __init__(self, low, high, name=None):
+        if not in_dygraph_mode():
+            check_type(low, 'low',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Uniform')
+            check_type(high, 'high',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Uniform')
+
+        self.all_arg_is_float = False
+        self.batch_size_unknown = False
+        self.name = name if name is not None else 'Uniform'
+
+        if isinstance(low, int):
+            low = float(low)
+        if isinstance(high, int):
+            high = float(high)
+
+        if self._validate_args(low, high):
+            self.batch_size_unknown = True
+            self.low = low
+            self.high = high
+        else:
+            if isinstance(low, float) and isinstance(high, float):
+                self.all_arg_is_float = True
+            self.low, self.high = self._to_tensor(low, high)
+
+    def sample(self, shape, seed=0):
+        """Generate samples of the specified shape.
+
+        Args:
+          shape (list): 1D `int32`. Shape of the generated samples.
+          seed (int): Python integer number.
+
+        Returns:
+          Tensor: A tensor with prepended dimensions shape.The data type is float32.
+
+        """
+        if not in_dygraph_mode():
+            check_type(shape, 'shape', (list), 'sample')
+            check_type(seed, 'seed', (int), 'sample')
+
+        name = self.name + '_sample'
+        batch_shape = list((self.low + self.high).shape)
+        if self.batch_size_unknown:
+            output_shape = shape + batch_shape
+            zero_tmp = tensor.fill_constant_batch_size_like(
+                self.low + self.high, batch_shape + shape, self.low.dtype, 0.)
+            uniform_random_tmp = nn.uniform_random_batch_size_like(
+                zero_tmp, zero_tmp.shape, min=0., max=1., seed=seed)
+            output = uniform_random_tmp * (zero_tmp + self.high - self.low
+                                           ) + self.low
+            return nn.reshape(output, output_shape, name=name)
+        else:
+            output_shape = shape + batch_shape
+            output = nn.uniform_random(
+                output_shape, seed=seed) * (tensor.zeros(
+                    output_shape, dtype=self.low.dtype) +
+                                            (self.high - self.low))
+            output = elementwise_add(output, self.low, name=name)
+            if self.all_arg_is_float:
+                return nn.reshape(output, shape, name=name)
+            else:
+                return output
+
+    def log_prob(self, value):
+        """Log probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: log probability.The data type is same with value.
+
+        """
+        name = self.name + '_log_prob'
+        if in_dygraph_mode():
+            lb_bool = self.low < value
+            ub_bool = value < self.high
+
+            dtype = value.dtype
+            lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype',
+                               dtype)
+            ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype',
+                               dtype)
+            return nn.log(lb * ub) - nn.log(self.high - self.low)
+
+        check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                 'log_prob')
+
+        lb_bool = control_flow.less_than(self.low, value)
+        ub_bool = control_flow.less_than(value, self.high)
+        lb = tensor.cast(lb_bool, dtype=value.dtype)
+        ub = tensor.cast(ub_bool, dtype=value.dtype)
+        return elementwise_sub(
+            nn.log(lb * ub), nn.log(self.high - self.low), name=name)
+
+    def probs(self, value):
+        """Probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: probability.The data type is same with value.
+
+        """
+        name = self.name + '_probs'
+        if in_dygraph_mode():
+            lb_bool = self.low < value
+            ub_bool = value < self.high
+
+            dtype = value.dtype
+            lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype',
+                               dtype)
+            ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype',
+                               dtype)
+            return (lb * ub) / (self.high - self.low)
+
+        check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                 'log_prob')
+
+        lb_bool = control_flow.less_than(self.low, value)
+        ub_bool = control_flow.less_than(value, self.high)
+        lb = tensor.cast(lb_bool, dtype=value.dtype)
+        ub = tensor.cast(ub_bool, dtype=value.dtype)
+        return elementwise_div((lb * ub), (self.high - self.low), name=name)
+
+    def entropy(self):
+        """Shannon entropy in nats.
+
+        Returns:
+          Tensor: Shannon entropy of uniform distribution.The data type is float32.
+
+        """
+        name = self.name + '_entropy'
+        return nn.log(self.high - self.low, name=name)
+
+
+class Normal(Distribution):
+    """The Normal distribution with location `loc` and `scale` parameters.
+
+    Mathematical details
+
+    The probability density function (pdf) is
+
+    .. math::
+
+        pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2}  {\sigma^2} }
+
+    .. math::
+
+        Z = (2 \pi \sigma^2)^{0.5}
+
+    In the above equation:
+
+    * :math:`loc = \mu`: is the mean.
+    * :math:`scale = \sigma`: is the std.
+    * :math:`Z`: is the normalization constant.
+
+    Args:
+        loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor.
+        scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor.
+        name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Examples:
+        .. code-block:: python
+          
+          import numpy as np
+          import paddle
+          from paddle.distribution import Normal
+
+          paddle.disable_static()
+          # Define a single scalar Normal distribution.
+          dist = Normal(loc=0., scale=3.)
+          # Define a batch of two scalar valued Normals.
+          # The first has mean 1 and standard deviation 11, the second 2 and 22.
+          dist = Normal(loc=[1., 2.], scale=[11., 22.])
+          # Get 3 samples, returning a 3 x 2 tensor.
+          dist.sample([3])
+
+          # Define a batch of two scalar valued Normals.
+          # Both have mean 1, but different standard deviations.
+          dist = Normal(loc=1., scale=[11., 22.])
+
+          # Complete example
+          value_npdata = np.array([0.8], dtype="float32")
+          value_tensor = paddle.to_tensor(value_npdata)
+
+          normal_a = Normal([0.], [1.])
+          normal_b = Normal([0.5], [2.])
+          sample = normal_a.sample([2])
+          # a random tensor created by normal distribution with shape: [2, 1]
+          entropy = normal_a.entropy()
+          # [1.4189385] with shape: [1]
+          lp = normal_a.log_prob(value_tensor)
+          # [-1.2389386] with shape: [1]
+          p = normal_a.probs(value_tensor)
+          # [0.28969154] with shape: [1]
+          kl = normal_a.kl_divergence(normal_b)
+          # [0.34939718] with shape: [1]
+    """
+
+    def __init__(self, loc, scale, name=None):
+        if not in_dygraph_mode():
+            check_type(loc, 'loc',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Normal')
+            check_type(scale, 'scale',
+                       (int, float, np.ndarray, tensor.Variable, list),
+                       'Normal')
+
+        self.batch_size_unknown = False
+        self.all_arg_is_float = False
+        self.name = name if name is not None else 'Normal'
+
+        if isinstance(loc, int):
+            loc = float(loc)
+        if isinstance(scale, int):
+            scale = float(scale)
+
+        if self._validate_args(loc, scale):
+            self.batch_size_unknown = True
+            self.loc = loc
+            self.scale = scale
+        else:
+            if isinstance(loc, float) and isinstance(scale, float):
+                self.all_arg_is_float = True
+            self.loc, self.scale = self._to_tensor(loc, scale)
+
+    def sample(self, shape, seed=0):
+        """Generate samples of the specified shape.
+
+        Args:
+          shape (list): 1D `int32`. Shape of the generated samples.
+          seed (int): Python integer number.
+
+        Returns:
+          Tensor: A tensor with prepended dimensions shape.The data type is float32.
+
+        """
+        if not in_dygraph_mode():
+            check_type(shape, 'shape', (list), 'sample')
+            check_type(seed, 'seed', (int), 'sample')
+
+        batch_shape = list((self.loc + self.scale).shape)
+        name = self.name + '_sample'
+
+        if self.batch_size_unknown:
+            output_shape = shape + batch_shape
+            zero_tmp = tensor.fill_constant_batch_size_like(
+                self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.)
+            zero_tmp_shape = nn.shape(zero_tmp)
+            normal_random_tmp = nn.gaussian_random(
+                zero_tmp_shape, mean=0., std=1., seed=seed)
+            output = normal_random_tmp * (zero_tmp + self.scale) + self.loc
+            return nn.reshape(output, output_shape, name=name)
+        else:
+            output_shape = shape + batch_shape
+            output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \
+                     (tensor.zeros(output_shape, dtype=self.loc.dtype) + self.scale)
+            output = elementwise_add(output, self.loc, name=name)
+            if self.all_arg_is_float:
+                return nn.reshape(output, shape, name=name)
+            else:
+                return output
+
+    def entropy(self):
+        """Shannon entropy in nats.
+
+        Returns:
+          Tensor: Shannon entropy of normal distribution.The data type is float32.
+
+        """
+        name = self.name + '_entropy'
+        batch_shape = list((self.loc + self.scale).shape)
+        zero_tmp = tensor.fill_constant_batch_size_like(
+            self.loc + self.scale, batch_shape, self.loc.dtype, 0.)
+        return elementwise_add(
+            0.5 + zero_tmp,
+            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
+            name=name)
+
+    def log_prob(self, value):
+        """Log probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: log probability.The data type is same with value.
+
+        """
+        if not in_dygraph_mode():
+            check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                     'log_prob')
+
+        name = self.name + '_log_prob'
+        var = self.scale * self.scale
+        log_scale = nn.log(self.scale)
+        return elementwise_sub(
+            -1. * ((value - self.loc) * (value - self.loc)) / (2. * var),
+            log_scale + math.log(math.sqrt(2. * math.pi)),
+            name=name)
+
+    def probs(self, value):
+        """Probability density/mass function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: probability.The data type is same with value.
+
+        """
+        if not in_dygraph_mode():
+            check_variable_and_dtype(value, 'value', ['float32', 'float64'],
+                                     'log_prob')
+
+        name = self.name + '_probs'
+        var = self.scale * self.scale
+        return elementwise_div(
+            ops.exp(-1. * ((value - self.loc) * (value - self.loc)) /
+                    (2. * var)), (math.sqrt(2 * math.pi) * self.scale),
+            name=name)
+
+    def kl_divergence(self, other):
+        """The KL-divergence between two normal distributions.
+
+        Args:
+            other (Normal): instance of Normal.
+
+        Returns:
+            Tensor: kl-divergence between two normal distributions.The data type is float32.
+
+        """
+        if not in_dygraph_mode():
+            check_type(other, 'other', Normal, 'kl_divergence')
+
+        name = self.name + '_kl_divergence'
+        var_ratio = self.scale / other.scale
+        var_ratio = (var_ratio * var_ratio)
+        t1 = (self.loc - other.loc) / other.scale
+        t1 = (t1 * t1)
+        return elementwise_add(
+            0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name)
diff --git a/python/paddle/fleet/base/util_factory.py b/python/paddle/fleet/base/util_factory.py
deleted file mode 100644
index 385500de8c018853fe46205fc3d5bc6aac1aa22d..0000000000000000000000000000000000000000
--- a/python/paddle/fleet/base/util_factory.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fleet Utils."""
-"""distributed operations"""
-"""basic collective operations in python"""
-"""remote file system"""
-
-__all__ = ['UtilBase']
-
-
-class UtilFactory(object):
-    def _create_util(self, context):
-        util = UtilBase()
-        util._set_strategy(context["valid_strategy"])
-        util._set_role_maker(context["role_maker"])
-        return util
-
-
-class UtilBase(object):
-    def __init__(self):
-        self.role_maker = None
-        self.dist_strategy = None
-
-    def _set_strategy(self, dist_strategy):
-        self.dist_strategy = dist_strategy
-
-    def _set_role_maker(self, role_maker):
-        self.role_maker = role_maker
-
-    '''
-    def set_file_system(self, fs_client):
-        self.fs_client = fs_client
-
-    def broadcast(self):
-        pass
-
-    def all_gather(self):
-        pass
-
-    def all_reduce(self):
-        pass
-
-    def reduce_scatter(self):
-        pass
-
-    def reduce(self):
-        pass
-
-    def get_file_shard(self, files):
-        pass
-
-    def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist):
-        pass
-
-    def save_program(program, output_dir):
-        pass
-
-    def load_program(input_dir):
-        pass
-
-    def load_var():
-        pass
-
-    def save_var():
-        pass
-
-    def print_on_rank(self):
-        pass
-    '''
diff --git a/python/paddle/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/fleet/meta_optimizers/pipeline_optimizer.py
deleted file mode 100644
index 9fd919f30f688d1b12fac258c2d6c9dc47fbf049..0000000000000000000000000000000000000000
--- a/python/paddle/fleet/meta_optimizers/pipeline_optimizer.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-from paddle.fluid.optimizer import PipelineOptimizer as PO
-from .meta_optimizer_base import MetaOptimizerBase
-
-__all__ = ["PipelineOptimizer"]
-
-
-class PipelineOptimizer(MetaOptimizerBase):
-    def __init__(self, optimizer):
-        super(PipelineOptimizer, self).__init__(optimizer)
-        self.inner_opt = optimizer
-        # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
-
-    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
-                        user_defined_strategy):
-        super(PipelineOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
-        num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
-        self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
-
-    def _can_apply(self):
-        if self.user_defined_strategy.pipeline == True:
-            return True
-        return False
-
-    def _disable_strategy(self, dist_strategy):
-        dist_strategy.pipeline = False
-        dist_strategy.pipeline_configs = {"micro_batch": 1}
-
-    def backward(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None,
-                 callbacks=None):
-        return self.wrapped_opt.backward(loss, startup_program, parameter_list,
-                                         no_grad_set, callbacks)
-
-    def minimize_impl(self,
-                      loss,
-                      startup_program=None,
-                      parameter_list=None,
-                      no_grad_set=None):
-        optimize_ops, params_grads, prog_list = \
-            self.wrapped_opt.minimize(loss, startup_program,
-                                      parameter_list, no_grad_set)
-        return optimize_ops, params_grads
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 776a52b300fe0c7c582b59947e13e5ca98daf4e4..2ed8642c86d95bf049920d281f4063da9779623e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -68,7 +68,7 @@ from .input import embedding, one_hot
 from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
-from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
+from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
 from .incubate import fleet
 from .incubate import data_generator
 from .transpiler import DistributeTranspiler, \
@@ -89,6 +89,7 @@ from .dygraph.base import enable_dygraph, disable_dygraph
 from .io import save, load, load_program_state, set_program_state
 from .dygraph.checkpoint import save_dygraph, load_dygraph
 from .dygraph.varbase_patch_methods import monkey_patch_varbase
+from . import generator
 Tensor = LoDTensor
 enable_imperative = enable_dygraph
 disable_imperative = disable_dygraph
@@ -96,7 +97,7 @@ disable_imperative = disable_dygraph
 __all__ = framework.__all__ + executor.__all__ + \
     trainer_desc.__all__ + transpiler.__all__ + \
     parallel_executor.__all__ + lod_tensor.__all__ + \
-    data_feed_desc.__all__ + compiler.__all__ + backward.__all__  + [
+    data_feed_desc.__all__ + compiler.__all__ + backward.__all__  + generator.__all__ + [
         'io',
         'initializer',
         'embedding',
@@ -118,6 +119,7 @@ __all__ = framework.__all__ + executor.__all__ + \
         'LoDTensor',
         'LoDTensorArray',
         'CPUPlace',
+        'XPUPlace',
         'CUDAPlace',
         'CUDAPinnedPlace',
         'Tensor',
@@ -166,17 +168,34 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
     sysstr = platform.system()
     read_env_flags = [
-        'check_nan_inf', 'fast_check_nan_inf', 'benchmark',
-        'eager_delete_scope', 'fraction_of_cpu_memory_to_use',
-        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'paddle_num_threads',
-        'dist_threadpool_size', 'eager_delete_tensor_gb',
-        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
-        'enable_parallel_graph', 'fuse_parameter_groups_size',
-        'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-        'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator',
-        'enable_unused_var_check', 'free_idle_chunk', 'free_when_no_cache_hit'
+        'check_nan_inf',
+        'fast_check_nan_inf',
+        'benchmark',
+        'eager_delete_scope',
+        'fraction_of_cpu_memory_to_use',
+        'initial_cpu_memory_in_mb',
+        'init_allocated_mem',
+        'paddle_num_threads',
+        'dist_threadpool_size',
+        'eager_delete_tensor_gb',
+        'fast_eager_deletion_mode',
+        'memory_fraction_of_eager_deletion',
+        'allocator_strategy',
+        'reader_queue_speed_test_mode',
+        'print_sub_graph_dir',
+        'pe_profile_fname',
+        'inner_op_parallelism',
+        'enable_parallel_graph',
+        'fuse_parameter_groups_size',
+        'multiple_of_cupti_buffer_size',
+        'fuse_parameter_memory_size',
+        'tracer_profile_fname',
+        'dygraph_debug',
+        'use_system_allocator',
+        'enable_unused_var_check',
+        'free_idle_chunk',
+        'free_when_no_cache_hit',
+        'call_stack_level',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -208,12 +227,19 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
-            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
-            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce',
-            'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time',
-            'local_exe_sub_scope_limit', 'gpu_memory_limit_mb'
+            'fraction_of_gpu_memory_to_use',
+            'initial_gpu_memory_in_mb',
+            'reallocate_gpu_memory_in_mb',
+            'cudnn_deterministic',
+            'enable_cublas_tensor_op_math',
+            'conv_workspace_size_limit',
+            'cudnn_exhaustive_search',
+            'selected_gpus',
+            'sync_nccl_allreduce',
+            'cudnn_batchnorm_spatial_persistent',
+            'gpu_allocator_retry_time',
+            'local_exe_sub_scope_limit',
+            'gpu_memory_limit_mb',
         ]
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 7b301ac19d1d3dc1f4aabb6cf3af2f0874faa677..5f6594a47213021c3a82dd4a0266f52240270e87 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -129,7 +129,7 @@ class GradientClipBase(object):
     def __str__(self):
         raise NotImplementedError()
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         raise NotImplementedError
 
@@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase):
     def __str__(self):
         return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
         for p, g in params_grads:
@@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase):
     def __str__(self):
         return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
         for p, g in params_grads:
@@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
     def __str__(self):
         return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
         sum_square_list = []
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 3097e1d82a9cb5e096efa3913ea6a06bff557c94..244a621611060b87805846f1ea748615bcdde19a 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -29,6 +29,7 @@ from .quantization_pass import _out_scale_op_list
 from .quantization_pass import _get_op_input_var_names
 from .quantization_pass import _get_op_output_var_names
 from .quantization_pass import _get_output_name_index
+from .quantization_pass import _channelwise_quant_axis1_ops
 
 __all__ = ['PostTrainingQuantization', 'WeightQuantization']
 
@@ -316,6 +317,7 @@ class PostTrainingQuantization(object):
         self._out_scale_op_list = _out_scale_op_list
         self._quantized_weight_var_name = set()
         self._quantized_act_var_name = set()
+        self.weight_op_pairs = {}
         self._sampling_data = {}
         self._quantized_var_kl_threshold = {}
         self._quantized_var_min = {}
@@ -436,6 +438,8 @@ class PostTrainingQuantization(object):
         graph = IrGraph(core.Graph(self._program.desc), for_test=True)
         graph = _remove_ctrl_vars(graph)
         graph = _apply_pass(self._scope, graph, 'conv_bn_fuse_pass')
+        graph = _apply_pass(self._scope, graph, 'depthwise_conv_bn_fuse_pass')
+        graph = _apply_pass(self._scope, graph, 'conv_transpose_bn_fuse_pass')
         self._program = graph.to_program()
 
     def _collect_target_varnames(self):
@@ -446,10 +450,11 @@ class PostTrainingQuantization(object):
         # TODO(juncaipeng), consider the name_scope of skip_quant
         _logger.info("Collect quantized variable names ...")
 
-        def collect_var_name(var_name_list, persistable_var_names):
+        def collect_var_name(var_name_list, persistable_var_names, op_type):
             for var_name in var_name_list:
                 if var_name in persistable_var_names:
                     self._quantized_weight_var_name.add(var_name)
+                    self.weight_op_pairs[var_name] = op_type
                 else:
                     self._quantized_act_var_name.add(var_name)
 
@@ -462,13 +467,15 @@ class PostTrainingQuantization(object):
             # For quantized ops, sample inputs and outputs
             if op_type in self._quantizable_op_type:
                 collect_var_name(
-                    _get_op_input_var_names(op), persistable_var_names)
+                    _get_op_input_var_names(op), persistable_var_names, op_type)
                 collect_var_name(
-                    _get_op_output_var_names(op), persistable_var_names)
+                    _get_op_output_var_names(op), persistable_var_names,
+                    op_type)
             # For other op, only sample output scale
             elif op_type in self._out_scale_op_list:
                 collect_var_name(
-                    _get_op_output_var_names(op), persistable_var_names)
+                    _get_op_output_var_names(op), persistable_var_names,
+                    op_type)
 
     def _set_activation_persistable(self):
         '''
@@ -492,45 +499,75 @@ class PostTrainingQuantization(object):
         Sample the input threshold(min, max, or abs_max) in every iterations.
         '''
         assert self._algo in ["abs_max", "min_max"], \
-            "The algo should be abs_max or min_max to sample min max value."
-
+            "The algo should be abs_max or min_max for _sample_threshold."
         if self._algo == "abs_max":
-            # Only calculate abs_max value for weight for once
-            if self._quantized_var_abs_max == {}:
-                for var_name in self._quantized_weight_var_name:
-                    var_tensor = _load_variable_data(self._scope, var_name)
-                    abs_max_per_channel = []
-                    for i in range(var_tensor.shape[0]):
-                        abs_max_per_channel.append(
-                            float(np.max(np.abs(var_tensor[i]))))
-                    self._quantized_var_abs_max[var_name] = abs_max_per_channel
-            for var_name in self._quantized_act_var_name:
-                var_tensor = _load_variable_data(self._scope, var_name)
-                abs_max_value = float(np.max(np.abs(var_tensor)))
-                if (var_name not in self._quantized_var_abs_max) or \
-                    (abs_max_value > self._quantized_var_abs_max[var_name]):
-                    self._quantized_var_abs_max[var_name] = abs_max_value
+            self._sample_threshold_abs_max()
         elif self._algo == "min_max":
-            if self._quantized_var_min == {} and self._quantized_var_max == {}:
-                for var_name in self._quantized_weight_var_name:
-                    var_tensor = _load_variable_data(self._scope, var_name)
-                    min_per_channel = []
-                    max_per_channle = []
-                    for i in range(var_tensor.shape[0]):
-                        min_per_channel.append(float(np.min(var_tensor[i])))
-                        max_per_channle.append(float(np.max(var_tensor[i])))
-                    self._quantized_var_min[var_name] = min_per_channel
-                    self._quantized_var_max[var_name] = max_per_channle
-            for var_name in self._quantized_act_var_name:
+            self._sample_threshold_min_max()
+
+    def _sample_threshold_abs_max(self):
+        assert self._algo == "abs_max", \
+            "The algo should be abs_max for _sample_threshold_abs_max."
+        # Only calculate abs_max value for weight for once
+        if self._quantized_var_abs_max == {}:
+            for var_name in self._quantized_weight_var_name:
+                var_tensor = _load_variable_data(self._scope, var_name)
+                if self._weight_quantize_type == "abs_max":
+                    abs_max_value = float(np.max(np.abs(var_tensor)))
+                elif self._weight_quantize_type == "channel_wise_abs_max":
+                    abs_max_value = []
+                    if self.weight_op_pairs[
+                            var_name] in _channelwise_quant_axis1_ops:
+                        for i in range(var_tensor.shape[1]):
+                            abs_max_value.append(
+                                float(np.max(np.abs(var_tensor[:, i]))))
+                    else:
+                        for i in range(var_tensor.shape[0]):
+                            abs_max_value.append(
+                                float(np.max(np.abs(var_tensor[i]))))
+                self._quantized_var_abs_max[var_name] = abs_max_value
+
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            abs_max_value = float(np.max(np.abs(var_tensor)))
+            if (var_name not in self._quantized_var_abs_max) or \
+                (abs_max_value > self._quantized_var_abs_max[var_name]):
+                self._quantized_var_abs_max[var_name] = abs_max_value
+
+    def _sample_threshold_min_max(self):
+        assert self._algo == "min_max", \
+            "The algo should be min_max for _sample_threshold_min_max."
+        if self._quantized_var_min == {} and self._quantized_var_max == {}:
+            for var_name in self._quantized_weight_var_name:
                 var_tensor = _load_variable_data(self._scope, var_name)
-                min_value = float(np.min(var_tensor))
-                max_value = float(np.max(var_tensor))
-                if (var_name not in self._quantized_var_min) or \
-                    (min_value < self._quantized_var_min[var_name]):
-                    self._quantized_var_min[var_name] = min_value
-                if (var_name not in self._quantized_var_max) or \
-                    (max_value > self._quantized_var_max[var_name]):
-                    self._quantized_var_max[var_name] = max_value
+                if self._weight_quantize_type == "abs_max":
+                    min_value = float(np.min(var_tensor))
+                    max_value = float(np.max(var_tensor))
+                elif self._weight_quantize_type == "channel_wise_abs_max":
+                    min_value = []
+                    max_value = []
+                    if self.weight_op_pairs[
+                            var_name] in _channelwise_quant_axis1_ops:
+                        for i in range(var_tensor.shape[1]):
+                            min_value.append(float(np.min(var_tensor[:, i])))
+                            max_value.append(float(np.max(var_tensor[:, i])))
+                    else:
+                        for i in range(var_tensor.shape[0]):
+                            min_value.append(float(np.min(var_tensor[i])))
+                            max_value.append(float(np.max(var_tensor[i])))
+                self._quantized_var_min[var_name] = min_value
+                self._quantized_var_max[var_name] = max_value
+
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            min_value = float(np.min(var_tensor))
+            max_value = float(np.max(var_tensor))
+            if (var_name not in self._quantized_var_min) or \
+                (min_value < self._quantized_var_min[var_name]):
+                self._quantized_var_min[var_name] = min_value
+            if (var_name not in self._quantized_var_max) or \
+                (max_value > self._quantized_var_max[var_name]):
+                self._quantized_var_max[var_name] = max_value
 
     def _save_input_threhold(self):
         '''
@@ -554,11 +591,6 @@ class PostTrainingQuantization(object):
         applied in every iteration.
         '''
         assert self._algo == "KL", "The algo should be KL to sample data."
-        for var_name in self._quantized_weight_var_name:
-            if var_name not in self._sampling_data:
-                var_tensor = _load_variable_data(self._scope, var_name)
-                self._sampling_data[var_name] = var_tensor
-
         if self._is_use_cache_file:
             for var_name in self._quantized_act_var_name:
                 var_tensor = _load_variable_data(self._scope, var_name)
@@ -584,15 +616,20 @@ class PostTrainingQuantization(object):
 
         # Abs_max threshold for weights
         for var_name in self._quantized_weight_var_name:
-            weight_data = self._sampling_data[var_name]
-            weight_threshold = None
+            weight_data = _load_variable_data(self._scope, var_name)
             if self._weight_quantize_type == "abs_max":
-                weight_threshold = np.max(np.abs(weight_data))
+                weight_threshold = float(np.max(np.abs(weight_data)))
             elif self._weight_quantize_type == "channel_wise_abs_max":
                 weight_threshold = []
-                for i in range(weight_data.shape[0]):
-                    abs_max_value = np.max(np.abs(weight_data[i]))
-                    weight_threshold.append(abs_max_value)
+                if self.weight_op_pairs[
+                        var_name] in _channelwise_quant_axis1_ops:
+                    for i in range(weight_data.shape[1]):
+                        weight_threshold.append(
+                            float(np.max(np.abs(weight_data[:, i]))))
+                else:
+                    for i in range(weight_data.shape[0]):
+                        weight_threshold.append(
+                            float(np.max(np.abs(weight_data[i]))))
             self._quantized_var_kl_threshold[var_name] = weight_threshold
 
         # KL threshold for activations
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 8851bcc6440d405f7484257b44760802feb0d8fb..14d1114a8f64a1238596fb1050e2cdb6e31ec6b0 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -66,6 +66,8 @@ _out_scale_op_list = [
     "concat",
     "elementwise_mul",
     "scale",
+    "hard_swish",
+    "hard_sigmoid",
 ]
 
 # list op real input and output names, to avoid processing input such as AxisTensor.
@@ -109,8 +111,14 @@ _op_real_in_out_name = {
     "sigmoid": [["X"], ["Out"]],
     "elementwise_mul": [["X", "Y"], ["Out"]],
     "scale": [["X"], ["Out"]],
+    "hard_swish": [["X"], ["Out"]],
+    "hard_sigmoid": [["X"], ["Out"]],
 }
 
+_conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
+
+_channelwise_quant_axis1_ops = ['conv2d_transpose', 'mul']
+
 
 def _get_op_input_var_names(op):
     """ """
@@ -185,10 +193,24 @@ def _is_input_all_not_persistable(graph, op_node):
     return is_input_all_not_persistable
 
 
+def _check_grandchild_op_node(op_node, grandchild_op_name):
+    '''
+    Check whether the fake_quant node has a grandchild op node named
+    grandchild_op_name.
+    '''
+    for out1_var_node in op_node.outputs:
+        for out1_op_node in out1_var_node.outputs:
+            for out2_var_node in out1_op_node.outputs:
+                for out2_op_node in out2_var_node.outputs:
+                    if out2_op_node.name() == grandchild_op_name:
+                        return True
+    return False
+
+
 class QuantizationTransformPass(object):
     """
-    Quantize the ops that have weights. Add quant and dequant ops for the quantized
-    ops's inputs.
+    Quantize the ops that have weights. Add quant and dequant ops for
+    the quantized ops's inputs.
     """
     _supported_quantizable_op_type = [
         'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul'
@@ -311,8 +333,8 @@ class QuantizationTransformPass(object):
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be "
-                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'."
-                % (str(weight_quantize_type)))
+                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' "
+                "or 'moving_average_abs_max'." % (str(weight_quantize_type)))
 
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
@@ -323,7 +345,6 @@ class QuantizationTransformPass(object):
         for op in self._quantizable_ops:
             assert op in QuantizationTransformPass._supported_quantizable_op_type, \
                 op + " is not supported for quantization."
-        self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
@@ -356,10 +377,12 @@ class QuantizationTransformPass(object):
             user_skipped = False
             if isinstance(self._skip_pattern, list):
                 user_skipped = op_node.op().has_attr("op_namescope") and \
-                               any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern)
+                               any(pattern in op_node.op().attr("op_namescope") \
+                                   for pattern in self._skip_pattern)
             elif isinstance(self._skip_pattern, str):
                 user_skipped = op_node.op().has_attr("op_namescope") and \
-                               op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
+                               op_node.op().attr("op_namescope").find(
+                                   self._skip_pattern) != -1
 
             if user_skipped:
                 op_node.op()._set_attr("skip_quant", True)
@@ -373,15 +396,11 @@ class QuantizationTransformPass(object):
                 if var_node.name() in dequantized_vars:
                     dequant_var_node = dequantized_vars[var_node.name()]
                 else:
-
                     name = var_node.name()
                     if name in processed_vars:
                         continue
-
-                    if var_node.name() in persistable_vars:
-                        is_weight = True
-                    else:
-                        is_weight = False
+                    is_weight = True if var_node.name() in persistable_vars \
+                        else False
 
                     # if var node is weight and weight_preprocess_func is not None,
                     # will insert weight preprocess func 
@@ -415,20 +434,14 @@ class QuantizationTransformPass(object):
                         else self._activation_bits
                     quant_type = self._weight_quantize_type if is_weight \
                         else self._activation_quantize_type
-                    if quant_type == 'channel_wise_abs_max':
-                        assert is_weight, "'channel_wise_abs_max' can only be applied on weights."
-                        if op.name() in self._conv_ops:
-                            quant_var_node, scale_var_node = self._insert_channel_quant_op(
-                                graph, var_node, name, quant_bits)
-                            dequant_var_node = self._insert_channel_dequant_op(
-                                graph, quant_var_node, [scale_var_node],
-                                [quant_bits])
-                        else:
-                            quant_var_node, scale_var_node = self._insert_quant_op(
-                                graph, var_node, name, quant_bits, 'abs_max')
-                            dequant_var_node = self._insert_dequant_op(
-                                graph, quant_var_node, scale_var_node,
-                                quant_bits)
+                    if quant_type == 'channel_wise_abs_max':  # Weight quantization
+                        quant_axis = 1 if op.name() in \
+                            _channelwise_quant_axis1_ops else 0
+                        quant_var_node, scale_var_node = self._insert_channel_quant_op(
+                            graph, var_node, name, quant_bits, quant_axis)
+                        dequant_var_node = self._insert_channel_dequant_op(
+                            graph, quant_var_node, [scale_var_node],
+                            [quant_bits], quant_axis)
                     else:
                         quant_var_node, scale_var_node = self._insert_quant_op(
                             graph, var_node, name, quant_bits, quant_type)
@@ -529,11 +542,19 @@ class QuantizationTransformPass(object):
             var_type=var_node.type(),
             shape=var_node.shape(),
             var_dtype=var_node.dtype())
-        scale_var_node = graph.create_var_node(
+        scale_var_node = graph.create_persistable_node(
             name=self._quantized_scale_name(name),
             var_type=var_node.type(),
             shape=[1],
             var_dtype=var_node.dtype())
+        data_type = 'float64' if var_node.dtype(
+        ) == core.VarDesc.VarType.FP64 else 'float32'
+        _init_var_node(
+            scale_var_node,
+            np.zeros(
+                scale_var_node.shape(), dtype=data_type),
+            self._scope,
+            self._place)
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
             attrs={
@@ -706,7 +727,8 @@ class QuantizationTransformPass(object):
 
         return quant_var_node, scale_out_node
 
-    def _insert_channel_quant_op(self, graph, var_node, name, quant_bits):
+    def _insert_channel_quant_op(self, graph, var_node, name, quant_bits,
+                                 quant_axis):
         """
         Insert fake_channel_wise_quantize_abs_max op in the graph.
         """
@@ -717,15 +739,24 @@ class QuantizationTransformPass(object):
             var_type=var_node.type(),
             shape=var_node.shape(),
             var_dtype=var_node.dtype())
-        scale_var_node = graph.create_var_node(
+        scale_var_node = graph.create_persistable_node(
             name=self._quantized_scale_name(name),
             var_type=var_node.type(),
-            shape=[var_node.shape()[0]],
+            shape=[var_node.shape()[quant_axis]],
             var_dtype=var_node.dtype())
+        data_type = 'float64' if var_node.dtype(
+        ) == core.VarDesc.VarType.FP64 else 'float32'
+        _init_var_node(
+            scale_var_node,
+            np.zeros(
+                scale_var_node.shape(), dtype=data_type),
+            self._scope,
+            self._place)
         quant_op_node = graph.create_op_node(
             op_type='fake_channel_wise_quantize_abs_max',
             attrs={
                 'bit_length': quant_bits,
+                'quant_axis': quant_axis,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node},
@@ -763,7 +794,7 @@ class QuantizationTransformPass(object):
         return dequant_var_node
 
     def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
-                                   quant_bits):
+                                   quant_bits, quant_axis):
         """
         Insert fake_channel_wise_dequantize_max_abs in the graph.
         """
@@ -778,6 +809,7 @@ class QuantizationTransformPass(object):
             op_type='fake_channel_wise_dequantize_max_abs',
             attrs={
                 'quant_bits': quant_bits,
+                'quant_axis': quant_axis,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node,
@@ -1036,7 +1068,6 @@ class QuantizationFreezePass(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
         self._weight_quantize_type = weight_quantize_type
-        self._conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
         self._fake_quant_op_names = _fake_quant_op_list
         self._fake_dequant_op_names = _fake_dequant_op_list
         self._op_input_rename_map = collections.OrderedDict()
@@ -1063,34 +1094,37 @@ class QuantizationFreezePass(object):
                     if input_arg_name in graph.out_node_mapping_table.keys():
                         input_arg_name = graph.out_node_mapping_table[
                             input_arg_name]
-                if input_arg_name in persistable_vars:
-                    if self._weight_quantize_type == 'abs_max':
-                        param = self._load_var(input_arg_name)
-                        scale_v = np.max(np.abs(param))
-                    elif self._weight_quantize_type == 'channel_wise_abs_max':
-                        param = self._load_var(input_arg_name)
-                        if len(param.shape) == 4:  # conv2d or depthwise_conv2d
-                            scale_v = []
-                            for i in range(param.shape[0]):
-                                scale_v.append(np.max(np.abs(param[i])))
-                        else:
-                            scale_v = np.max(np.abs(param))
+                if input_arg_name not in persistable_vars:
+                    scale_v = graph._find_node_by_name(
+                        op_node.outputs, op_node.output('OutScale')[0])
+                    self._quant_var_scale_map[input_arg_name] = scale_v
+                else:
+                    # Obtain scale from OutScale var node
+                    scale_v = self._load_var(op_node.output('OutScale')[0])
+                    assert scale_v.ndim in [
+                        1, 2
+                    ], "the dim of scale_v should be 1 or 2"
+                    if scale_v.ndim == 2:
+                        scale_v = scale_v[0]
+                    if scale_v.size == 1:
+                        scale_v = scale_v[0]
                     else:
-                        scale_v = self._load_var(
-                            op_node.output('OutScale')[0])[0]
+                        scale_v = scale_v.tolist()
                     self._quant_var_scale_map[input_arg_name] = scale_v
-                    self._remove_fake_quant_and_dequant_op(graph, op_node)
-                    # quantize weight and restore
+                    # Quantize weight and restore
                     param_v = self._load_var(input_arg_name)
-                    quantized_param_v = self._quant(param_v, scale_v,
-                                                    self._weight_bits)
+                    if isinstance(scale_v, list) and \
+                        any(_check_grandchild_op_node(op_node, op)
+                        for op in _channelwise_quant_axis1_ops):
+                        quant_axis = 1
+                    else:
+                        quant_axis = 0
+                    quantized_param_v = self._quant(
+                        param_v, scale_v, self._weight_bits, quant_axis)
                     self._restore_var(input_arg_name, quantized_param_v)
-                else:
-                    scale_v = graph._find_node_by_name(
-                        op_node.outputs, op_node.output('OutScale')[0])
-                    self._quant_var_scale_map[input_arg_name] = scale_v
+                    self._remove_fake_quant_and_dequant_op(graph, op_node)
 
-        # Remove all fake dequant op
+# Remove all fake dequant op
         ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
@@ -1103,8 +1137,7 @@ class QuantizationFreezePass(object):
             op_node_desc = op_node.op()
             if op_node_desc.has_attr("quantization_type") and \
                 op_node_desc.attr("quantization_type") == "qat_with_weight":
-                if self._weight_quantize_type == 'channel_wise_abs_max' \
-                    and op_node.name() in self._conv_ops:
+                if self._weight_quantize_type == 'channel_wise_abs_max':
                     self._insert_post_channel_dequant_op(graph, op_node)
                 else:
                     self._insert_post_dequant_op(graph, op_node)
@@ -1295,10 +1328,15 @@ class QuantizationFreezePass(object):
         return isinstance(v, float) or isinstance(v, np.float32) \
             or isinstance(v, np.float64)
 
-    def _quant(self, x, scale, num_bits):
+    def _quant(self, x, scale, num_bits, quant_axis):
+        assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.'
         if isinstance(scale, list):
             for i, s in enumerate(scale):
-                x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+                if quant_axis == 0:
+                    x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+                else:
+                    x[:, i] = np.round(x[:, i] / s * (
+                        (1 << (num_bits - 1)) - 1))
             return x
         else:
             return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
@@ -1468,6 +1506,10 @@ class OutScaleForTrainingPass(object):
         for op in target_ops:
             for output_var_name in _get_op_output_var_names(op):
                 in_node = graph._find_node_by_name(op.outputs, output_var_name)
+                if in_node.dtype() not in \
+                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
+                    continue
+
                 scale_node = graph.create_persistable_node(
                     name=self._scale_name(in_node.name()),
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
@@ -1570,17 +1612,26 @@ class OutScaleForInferencePass(object):
             if op_node.name() in self._teller_set:
                 var_names = _get_op_output_var_names(op_node)
                 for var_name in var_names:
-                    # For compatibility, we save output threshold by two methods.
+                    in_node = graph._find_node_by_name(op_node.outputs,
+                                                       var_name)
+                    if in_node.dtype() not in \
+                        [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
+                        continue
+
                     scale_name = self._scale_name(var_name)
-                    scale_v = np.array(
-                        self._scope.find_var(scale_name).get_tensor())[0]
-                    op_node.op()._set_attr("out_threshold", float(scale_v))
+                    scale_var = self._scope.find_var(scale_name)
+                    assert scale_var is not None, \
+                        "Can not find {} variable in the scope".format(scale_name)
+                    scale_value = np.array(scale_var.get_tensor())[0]
+
+                    # For compatibility, we save output threshold by two methods.
+                    op_node.op()._set_attr("out_threshold", float(scale_value))
 
                     argname_index = _get_output_name_index(op_node, var_name)
                     assert argname_index is not None, \
                         var_name + " is not the output of the op"
                     op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \
-                        + "_threshold", float(scale_v))
+                        + "_threshold", float(scale_value))
         graph.resolve_hazard()
         return graph
 
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index df7e585d45f445067b3a700951418c06c9062ae7..f0943b72add6a7a37b5da45853bdddc8c5224a3e 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -123,6 +123,7 @@ endfunction()
 
 if(WIN32)
 	list(REMOVE_ITEM TEST_OPS test_light_nas)
+	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
 	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
     list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
     list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1)
@@ -263,6 +264,13 @@ list(REMOVE_ITEM TEST_OPS
 #TODO(wanghaoshuang): Fix this unitest failed on GCC8.
 LIST(REMOVE_ITEM TEST_OPS test_auto_pruning)
 LIST(REMOVE_ITEM TEST_OPS test_filter_pruning)
+
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+
+# setting timeout value for old unittests
+if(NOT WIN32)
+	set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250)
+	set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200)
+endif()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac1590b8aa6eaefbccd3907b314fb438386ffc6
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
@@ -0,0 +1,226 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+import unittest
+import os
+import time
+import sys
+import random
+import math
+import functools
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.dataset.common import download
+from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
+
+random.seed(0)
+np.random.seed(0)
+
+
+class TestPostTrainingQuantization(unittest.TestCase):
+    def setUp(self):
+        self.download_path = 'int8/download'
+        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
+                                               self.download_path)
+        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        self.int8_model_path = os.path.join(os.getcwd(),
+                                            "post_training_" + self.timestamp)
+        try:
+            os.system("mkdir -p " + self.int8_model_path)
+        except Exception as e:
+            print("Failed to create {} due to {}".format(self.int8_model_path,
+                                                         str(e)))
+            sys.exit(-1)
+
+    def tearDown(self):
+        try:
+            os.system("rm -rf {}".format(self.int8_model_path))
+        except Exception as e:
+            print("Failed to delete {} due to {}".format(self.int8_model_path,
+                                                         str(e)))
+
+    def cache_unzipping(self, target_folder, zip_path):
+        if not os.path.exists(target_folder):
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
+                                                          zip_path)
+            os.system(cmd)
+
+    def download_model(self, data_url, data_md5, folder_name):
+        download(data_url, self.download_path, data_md5)
+        file_name = data_url.split('/')[-1]
+        zip_path = os.path.join(self.cache_folder, file_name)
+        print('Data is downloaded at {0}'.format(zip_path))
+
+        data_cache_folder = os.path.join(self.cache_folder, folder_name)
+        self.cache_unzipping(data_cache_folder, zip_path)
+        return data_cache_folder
+
+    def run_program(self, model_path, batch_size, infer_iterations):
+        print("test model path:" + model_path)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        [infer_program, feed_dict, fetch_targets] = \
+            fluid.io.load_inference_model(model_path, exe)
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size)
+
+        img_shape = [1, 28, 28]
+        test_info = []
+        cnt = 0
+        periods = []
+        for batch_id, data in enumerate(val_reader()):
+            image = np.array(
+                [x[0].reshape(img_shape) for x in data]).astype("float32")
+            input_label = np.array([x[1] for x in data]).astype("int64")
+
+            t1 = time.time()
+            out = exe.run(infer_program,
+                          feed={feed_dict[0]: image},
+                          fetch_list=fetch_targets)
+            t2 = time.time()
+            period = t2 - t1
+            periods.append(period)
+
+            out_label = np.argmax(np.array(out[0]), axis=1)
+            top1_num = sum(input_label == out_label)
+            test_info.append(top1_num)
+            cnt += len(data)
+
+            if (batch_id + 1) == infer_iterations:
+                break
+
+        throughput = cnt / np.sum(periods)
+        latency = np.average(periods)
+        acc1 = np.sum(test_info) / cnt
+        return (throughput, latency, acc1)
+
+    def generate_quantized_model(self,
+                                 model_path,
+                                 algo="KL",
+                                 quantizable_op_type=["conv2d"],
+                                 is_full_quantize=False,
+                                 is_use_cache_file=False,
+                                 is_optimize_model=False,
+                                 batch_size=10,
+                                 batch_nums=10):
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        scope = fluid.global_scope()
+        val_reader = paddle.dataset.mnist.train()
+
+        ptq = PostTrainingQuantization(
+            executor=exe,
+            model_dir=model_path,
+            sample_generator=val_reader,
+            batch_size=batch_size,
+            batch_nums=batch_nums,
+            algo=algo,
+            quantizable_op_type=quantizable_op_type,
+            is_full_quantize=is_full_quantize,
+            optimize_model=is_optimize_model,
+            is_use_cache_file=is_use_cache_file)
+        ptq.quantize()
+        ptq.save_quantized_model(self.int8_model_path)
+
+    def run_test(self,
+                 model_name,
+                 data_url,
+                 data_md5,
+                 algo,
+                 quantizable_op_type,
+                 is_full_quantize,
+                 is_use_cache_file,
+                 is_optimize_model,
+                 diff_threshold,
+                 batch_size=10,
+                 infer_iterations=10,
+                 quant_iterations=5):
+
+        origin_model_path = self.download_model(data_url, data_md5, model_name)
+        origin_model_path = os.path.join(origin_model_path, model_name)
+
+        print("Start FP32 inference for {0} on {1} images ...".format(
+            model_name, infer_iterations * batch_size))
+        (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
+            origin_model_path, batch_size, infer_iterations)
+
+        print("Start INT8 post training quantization for {0} on {1} images ...".
+              format(model_name, quant_iterations * batch_size))
+        self.generate_quantized_model(
+            origin_model_path, algo, quantizable_op_type, is_full_quantize,
+            is_use_cache_file, is_optimize_model, batch_size, quant_iterations)
+
+        print("Start INT8 inference for {0} on {1} images ...".format(
+            model_name, infer_iterations * batch_size))
+        (int8_throughput, int8_latency, int8_acc1) = self.run_program(
+            self.int8_model_path, batch_size, infer_iterations)
+
+        print("---Post training quantization of {} method---".format(algo))
+        print(
+            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.".
+            format(model_name, batch_size, fp32_throughput, fp32_latency,
+                   fp32_acc1))
+        print(
+            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n".
+            format(model_name, batch_size, int8_throughput, int8_latency,
+                   int8_acc1))
+        sys.stdout.flush()
+
+        delta_value = fp32_acc1 - int8_acc1
+        self.assertLess(delta_value, diff_threshold)
+
+
+class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
+    def test_post_training_kl(self):
+        model_name = "mnist_model"
+        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
+        data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
+        algo = "KL"
+        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
+        is_full_quantize = False
+        is_use_cache_file = False
+        is_optimize_model = True
+        diff_threshold = 0.01
+        batch_size = 10
+        infer_iterations = 50
+        quant_iterations = 5
+        self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, is_optimize_model,
+                      diff_threshold, batch_size, infer_iterations,
+                      quant_iterations)
+
+
+class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization):
+    def test_post_training_abs_max(self):
+        model_name = "mnist_model"
+        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
+        data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
+        algo = "abs_max"
+        quantizable_op_type = ["conv2d", "mul"]
+        is_full_quantize = True
+        is_use_cache_file = False
+        is_optimize_model = True
+        diff_threshold = 0.01
+        batch_size = 10
+        infer_iterations = 50
+        quant_iterations = 10
+        self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type,
+                      is_full_quantize, is_use_cache_file, is_optimize_model,
+                      diff_threshold, batch_size, infer_iterations,
+                      quant_iterations)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
index c9ea15bf6cde9af16810920f53a7d5e045a852e3..32292c8a47b50bc5e7eb2d7833823e586eea8909 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
@@ -33,34 +33,29 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["CPU_NUM"] = "1"
 
 
-def residual_block(img, label, num=1):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            use_cudnn=False,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    hidden = img
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 20, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 20, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        pool_type='max',
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        pool_type='avg',
+        act="relu")
+    hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return avg_loss
 
 
 def pact(x, name=None):
@@ -102,7 +97,7 @@ class TestUserDefinedQuantization(unittest.TestCase):
                     img.stop_gradient = False
                     label = fluid.layers.data(
                         name='label', shape=[1], dtype='int64')
-                    loss = residual_block(img, label, 1)
+                    loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.SGD(learning_rate=0.0001)
                         opt.minimize(loss)
diff --git a/python/paddle/fluid/contrib/tests/test_distributed_reader.py b/python/paddle/fluid/contrib/tests/test_distributed_reader.py
index 51e1455e71ecfe3f347977bea17a56e556c5ce0d..b964168eb3a2f14fa6dd55d189592daa6ec93d3c 100644
--- a/python/paddle/fluid/contrib/tests/test_distributed_reader.py
+++ b/python/paddle/fluid/contrib/tests/test_distributed_reader.py
@@ -36,8 +36,9 @@ class TestDistributedReader(unittest.TestCase):
         data = next(reader())
         assert data == 1
 
-        os.unsetenv('PADDLE_TRAINER_ID')
-        os.unsetenv('PADDLE_TRAINERS_NUM')
+        #Note: windows python3 don't have unsetenv
+        del os.environ['PADDLE_TRAINER_ID']
+        del os.environ['PADDLE_TRAINERS_NUM']
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index d87363abf14cdfc3e29567bd41dbac387b882f76..a05aa3b0a84b57bb1f9ce00b0ad007280c316c6e 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -201,10 +201,13 @@ def pre_load(dso_name):
 
 
 def get_glibc_ver():
-    return run_shell_command("ldd --version | awk '/ldd/{print $NF}'").strip()
+    return run_shell_command("ldd --version | awk '/ldd/{print $NF}'")
 
 
 def less_than_ver(a, b):
+    if a is None or b is None:
+        return False
+
     import re
     import operator
 
diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py
index 2c75c493cba02dc21a5e2518a8a5e52b6eb4fd81..dc57e9f71ed3d0de1a374bdf719b32a083198b31 100644
--- a/python/paddle/fluid/data.py
+++ b/python/paddle/fluid/data.py
@@ -18,17 +18,14 @@ import six
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.data_feeder import check_dtype, check_type
+from ..utils import deprecated
 
 __all__ = ['data']
 
 
+@deprecated(since="2.0.0", update_to="paddle.static.data")
 def data(name, shape, dtype='float32', lod_level=0):
     """
-    :api_attr: Static Graph
-	:alias_main: paddle.nn.data
-	:alias: paddle.nn.data,paddle.nn.input.data
-	:old_api: paddle.fluid.data
-
     **Data Layer**
 
     This function creates a variable on the global block. The global variable
@@ -52,7 +49,7 @@ def data(name, shape, dtype='float32', lod_level=0):
 
         The default :code:`stop_gradient` attribute of the Variable created by
         this API is true, which means the gradient won't be passed backward
-        through the data Varaible. Set :code:`var.stop_gradient = False` If
+        through the data Variable. Set :code:`var.stop_gradient = False` If
         user would like to pass backward gradient.
 
     Args:
@@ -88,7 +85,7 @@ def data(name, shape, dtype='float32', lod_level=0):
 
           z = x + y
 
-          # In this example, we will feed x and y with np-ndarry "1"
+          # In this example, we will feed x and y with np-ndarray "1"
           # and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle
           feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)
 
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index e8d708e04ce54bf6589ada0a55de13f06f0ba2a9..45aa85d4168a55e206460ce2e39292013caa9ce0 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -50,14 +50,15 @@ def convert_dtype(dtype):
     elif isinstance(dtype, type):
         if dtype in [
                 np.bool, np.float16, np.float32, np.float64, np.int8, np.int16,
-                np.int32, np.int64, np.uint8
+                np.int32, np.int64, np.uint8, np.complex64, np.complex128
         ]:
             return dtype.__name__
     else:
         if dtype in [
                 'bool', 'float16', 'float32', 'float64', 'int8', 'int16',
-                'int32', 'int64', 'uint8', u'bool', u'float16', u'float32',
-                u'float64', u'int8', u'int16', u'int32', u'int64', u'uint8'
+                'int32', 'int64', 'uint8', 'complex64', 'complex128', u'bool',
+                u'float16', u'float32', u'float64', u'int8', u'int16', u'int32',
+                u'int64', u'uint8', u'complex64', u'complex128'
         ]:
             # this code is a little bit dangerous, since error could happen
             # when casting no-ascii code to str in python2.
@@ -68,7 +69,7 @@ def convert_dtype(dtype):
 
     raise TypeError(
         "dtype must be any of [bool, float16, float32, float64, int8, int16, "
-        "int32, int64, uint8], but received %s" % dtype)
+        "int32, int64, uint8, complex64, complex128], but received %s" % dtype)
 
 
 def check_variable_and_dtype(input,
diff --git a/python/paddle/fluid/dataloader/__init__.py b/python/paddle/fluid/dataloader/__init__.py
index 62aefd6aec8cb92faf4595a821381f35c2abd5bd..597f1f217483ccafe73d0b4fe337cb2b24b4b436 100644
--- a/python/paddle/fluid/dataloader/__init__.py
+++ b/python/paddle/fluid/dataloader/__init__.py
@@ -20,5 +20,13 @@ from .dataset import *
 from . import batch_sampler
 from .batch_sampler import *
 
+from . import dataloader_iter
+from .dataloader_iter import *
+
+from . import sampler
+from .sampler import *
+
 __all__ = dataset.__all__ \
-        + batch_sampler.__all__
+        + batch_sampler.__all__ \
+        + dataloader_iter.__all__ \
+        + sampler.__all__
diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py
index c6163f7da1ee6de139b7503e8e6a6c3722ea8a7b..1d180329b72510de5e7e9362e4c002f4508ba1be 100644
--- a/python/paddle/fluid/dataloader/batch_sampler.py
+++ b/python/paddle/fluid/dataloader/batch_sampler.py
@@ -16,12 +16,13 @@ from __future__ import print_function
 from __future__ import division
 
 import numpy as np
-from .dataset import Dataset
+from .sampler import Sampler, SequenceSampler, RandomSampler
+from .dataset import Dataset, IterableDataset
 
 __all__ = ["BatchSampler"]
 
 
-class BatchSampler(object):
+class BatchSampler(Sampler):
     """
     A base implement of batch sampler used by `paddle.io.DataLoader`
     which yield mini-batch indices(a list/tuple with length as
@@ -41,10 +42,11 @@ class BatchSampler(object):
                 implement or other python object which implemented
                 :code:`__len__` for BatchSampler to get indices as the
                 range of :attr:`dataset` length. Default None.
-        indices (list|tuple): a substitution parameter for
-                :attr:`dataset` either :attr:`dataset` or
-                :attr:`indices` should be set, give the whole
-                indices to sampler from directly. Default None.
+        sampler (Sampler): this could be a :code:`paddle.io.Dataset`
+                instance which implemented :code:`__iter__` to yield
+                sample indices. :attr:`sampler` and :attr:`dataset`
+                can not be set in the same time.  If :attr:`sampler`
+                is set, :attr:`shuffle` should not be set. Default None.
         shuffle(bool): whether to shuffle indices order before genrating
                 batch indices. Default False.
         batch_size(int): sample indice number in a mini-batch indices.
@@ -58,16 +60,7 @@ class BatchSampler(object):
         
         .. code-block:: python
             
-            from paddle.io import BatchSampler, Dataset
-
-            # init with indices
-            bs = BatchSampler(indices=list(range(100)),
-                              shuffle=True,
-                              batch_size=8,
-                              drop_last=True)
-
-            for batch_indices in bs:
-                print(batch_indices)
+            from paddle.io import RandomSampler, BatchSampler, Dataset
 
             # init with dataset
             class RandomDataset(Dataset):
@@ -90,46 +83,57 @@ class BatchSampler(object):
             for batch_indices in bs:
                 print(batch_indices)
 
+            # init with sampler
+            sampler = RandomSampler(RandomDataset(100))
+            bs = BatchSampler(sampler=sampler,
+                              batch_size=8,
+                              drop_last=True)
+
+            for batch_indices in bs:
+                print(batch_indices)
+
+
     see `paddle.io.DataLoader`
 
     """
 
     def __init__(self,
                  dataset=None,
-                 indices=None,
+                 sampler=None,
                  shuffle=False,
                  batch_size=1,
                  drop_last=False):
         if dataset is None:
-            assert indices is not None, \
-                "either dataset or indices should be set"
-            assert isinstance(indices, list) or isinstance(indices, tuple), \
-                "indices should be a list or tuple, but got {}".format(type(indices))
-            self.indices = indices
+            assert sampler is not None, \
+                "either dataset or sampler should be set"
+            assert isinstance(sampler, Sampler), \
+                "sampler should be a paddle.io.Sampler, but got {}".format(type(sampler))
+            assert not shuffle, "shuffle should be False when sampler is set"
+            self.sampler = sampler
         else:
             assert isinstance(dataset, Dataset), \
-                "dataset should be an instance of paddle.io.Dataset"
-            assert indices is None, \
-                "should not set both dataset and indices"
-            self.indices = list(range(len(dataset)))
+                "dataset should be a paddle.io.Dataset"
+            assert not isinstance(dataset, IterableDataset), \
+                "dataset should not be a paddle.io.IterableDataset"
+            assert sampler is None, \
+                "should not set both dataset and sampler"
+            assert isinstance(shuffle, bool), \
+                "shuffle should be a boolean value, but got {}".format(type(shuffle))
+            if shuffle:
+                self.sampler = RandomSampler(dataset)
+            else:
+                self.sampler = SequenceSampler(dataset)
 
         assert isinstance(batch_size, int) and batch_size > 0, \
             "batch_size should be a positive integer, but got {}".format(batch_size)
         self.batch_size = batch_size
-        assert isinstance(shuffle, bool), \
-            "shuffle should be a boolean value, but got {}".format(type(shuffle))
-        self.shuffle = shuffle
         assert isinstance(drop_last, bool), \
             "drop_last should be a boolean value, but got {}".format(type(drop_last))
         self.drop_last = drop_last
 
     def __iter__(self):
-        if self.shuffle:
-            np.random.shuffle(self.indices)
-        _iter = iter(self.indices)
-
         batch_indices = []
-        for idx in _iter:
+        for idx in self.sampler:
             batch_indices.append(idx)
             if len(batch_indices) == self.batch_size:
                 yield batch_indices
@@ -138,6 +142,19 @@ class BatchSampler(object):
             yield batch_indices
 
     def __len__(self):
-        num_samples = len(self.indices)
+        num_samples = len(self.sampler)
         num_samples += int(not self.drop_last) * (self.batch_size - 1)
         return num_samples // self.batch_size
+
+
+class _InfiniteIterableSampler(object):
+    def __init__(self, dataset, batch_size=1):
+        assert isinstance(
+            dataset, IterableDataset
+        ), "dataset should be an instance of paddle.io.IterableDataset"
+        self.dataset = dataset
+        self.batch_size = batch_size
+
+    def __iter__(self):
+        while True:
+            yield [None] * self.batch_size
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index 214cd772af6b1fa6e24ec972c0f0644dc1c09f95..a81d73d7e9a621d2a02ed91541f32b827bdff38c 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -22,6 +22,7 @@ import itertools
 import threading
 import numpy as np
 import multiprocessing
+from collections import namedtuple
 
 # NOTE: queue has a different name in python2 and python3
 if six.PY2:
@@ -32,11 +33,17 @@ else:
 from .. import core
 from ..framework import in_dygraph_mode
 from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler
+from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
+
+__all__ = ['get_worker_info']
 
 # multi-process worker check indices queue interval, avoid
 # hanging in subprocess data loading
 MP_INDICES_CHECK_INTERVAL = 5
 
+_IterableDatasetStopIteration = namedtuple('_IterableDatasetStopIteration',
+                                           ['worker_id'])
+
 
 def default_collate_fn(batch):
     """
@@ -75,6 +82,20 @@ def default_collate_fn(batch):
     return [np.stack(slot, axis=0) for slot in slots]
 
 
+class _DatasetKind(object):
+    MAP = 0
+    ITER = 1
+
+    @staticmethod
+    def create_fetcher(kind, dataset, collate_fn, drop_last):
+        if kind == _DatasetKind.MAP:
+            return _MapDatasetFetcher(dataset, collate_fn, drop_last)
+        elif kind == _DatasetKind.ITER:
+            return _IterableDatasetFetcher(dataset, collate_fn, drop_last)
+        else:
+            raise NotImplementedError("unknown Dataset kind {}".format(kind))
+
+
 class ParentWatchDog(object):
     def __init__(self):
         self._parent_pid = os.getppid()
@@ -86,6 +107,92 @@ class ParentWatchDog(object):
         return self._parent_alive
 
 
+# worker information for each workers, used for splitting data copy
+# for IteratorDataset in worker processes.
+_worker_info = None
+
+
+def get_worker_info():
+    """
+    Get DataLoader worker process information function, this function is
+    used to split data copy in worker process for IterableDataset
+    (see :code:`paddle.io.IterableDataset`), worker information contains
+    following fields:
+
+    :attr:`num_workers`: total worker process number, see `paddle.io.DataLoader`
+
+    :attr:`id`: the worker processs id, count from 0 to :attr:`num_workers - 1`
+
+    :attr:`dataset`: the dataset object in this worker process
+
+    Returns:
+        WorkerInfo: an instance of WorkerInfo which contains fields above.
+
+    .. note::
+        For mode usage and exampls, please see :code:`paddle.io.IterableDataset`
+
+    Example:
+
+        .. code-block:: python
+
+            import math
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.io import IterableDataset, DataLoader, get_worker_info
+
+            class SplitedIterableDataset(IterableDataset):
+                def __init__(self, start, end):
+                    self.start = start
+                    self.end = end
+
+                def __iter__(self):
+                    worker_info = get_worker_info()
+                    if worker_info is None:
+                        iter_start = self.start
+                        iter_end = self.end
+                    else:
+                        per_worker = int(
+                            math.ceil((self.end - self.start) / float(
+                                worker_info.num_workers)))
+                        worker_id = worker_info.id
+                        iter_start = self.start + worker_id * per_worker
+                        iter_end = min(iter_start + per_worker, self.end)
+
+                    for i in range(iter_start, iter_end):
+                        yield np.array([i])
+
+            place = fluid.CPUPlace()
+            with fluid.dygraph.guard(place):
+                dataset = SplitedIterableDataset(start=2, end=9)
+                dataloader = DataLoader(
+                    dataset,
+                    places=place,
+                    num_workers=2,
+                    batch_size=1,
+                    drop_last=True)
+
+                print(list(dataloader))
+                # outputs: [2, 5, 3, 6, 4, 7]
+
+    """
+    return _worker_info
+
+
+class WorkerInfo(object):
+    __initialized = False
+
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        self.__initialized = True
+
+    def __setattr__(self, key, val):
+        if self.__initialized:
+            raise RuntimeError("Cannot assign attributes to {} objects".format(
+                self.__class__.__name__))
+        return super(WorkerInfo, self).__setattr__(key, val)
+
+
 class _DataLoaderIterBase(object):
     """
     Iterator implement of DataLoader, will load and feed mini-batch
@@ -108,6 +215,8 @@ class _DataLoaderIterBase(object):
         self._use_shared_memory = loader.use_shared_memory
         self._timeout = loader.timeout if loader.timeout > 0 else MP_INDICES_CHECK_INTERVAL
         self._worker_init_fn = loader.worker_init_fn
+        self._dataset_kind = loader.dataset_kind
+        self._pin_memory = loader.pin_memory
 
         # LoDTensorBlockingQueue instance for create_py_reader and a thread
         # to put mini-batch data to self._blocking_queue, mini-batch data
@@ -134,6 +243,9 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
     def __init__(self, loader):
         super(_DataLoaderIterSingleProcess, self).__init__(loader)
 
+        self._dataset_fetcher = _DatasetKind.create_fetcher(
+            self._dataset_kind, self._dataset, self._collate_fn, True)
+
         # NOTE: len(self._places) batch data compose as an output
         # iteration, set blocking_queue can cache 2 iteration datas
         # at most here
@@ -154,7 +266,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
             len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_buffer_reader, True)
+            self._need_check_feed, self._places, self._use_buffer_reader, True,
+            self._pin_memory)
 
         self._thread = threading.Thread(target=self._thread_loop)
         self._thread.daemon = True
@@ -164,9 +277,7 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
         try:
             for indices in self._sampler_iter:
                 # read data from dataset in mini-batch
-                batch = [self._dataset[i] for i in indices]
-                if self._collate_fn is not None:
-                    batch = self._collate_fn(batch)
+                batch = self._dataset_fetcher.fetch(indices)
 
                 # pack as LoDTensorArray
                 array = core.LoDTensorArray()
@@ -184,6 +295,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
 
             self._blocking_queue.close()
             self._thread = None
+        except StopIteration:
+            self._blocking_queue.close()
         except Exception:
             self._blocking_queue.kill()
             self._thread = None
@@ -231,11 +344,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
 
         # data get from _data_queue will be reordered by _rcvd_idx
         # for data order keeping, data index not equal _rcvd_idx 
-        # will be cached in _reorder_dict
+        # will be cached in _task_infos
         self._send_idx = 0
         self._rcvd_idx = 0
         self._batches_outstanding = 0
-        self._reorder_dict = {}
+        self._task_infos = {}
 
         # indices outstand as _outstanding_capacity at first, and
         # blocking_queue capacity is also _outstanding_capacity.
@@ -246,14 +359,17 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
         self._outstanding_capacity = 2 * max(self._num_workers,
                                              len(self._places))
 
-        self._init_workers()
-        self._init_thread()
-
-        self._shutdown = False
+        # see _try_put_indices
+        self._thread_lock = threading.Lock()
 
+        # init workers and indices queues and put 2 indices in each indices queue
+        self._init_workers()
         for _ in range(self._outstanding_capacity):
             self._try_put_indices()
 
+        self._init_thread()
+        self._shutdown = False
+
     def _init_workers(self):
         # multiprocess worker and indice queue list initial as empty
         self._workers = []
@@ -274,9 +390,10 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             self._indices_queues.append(indices_queue)
             worker = multiprocessing.Process(
                 target=self._worker_loop,
-                args=(self._dataset, indices_queue, self._data_queue,
-                      self._workers_done_event, self._collate_fn,
-                      self._worker_init_fn, i))
+                args=(self._dataset, self._dataset_kind, indices_queue,
+                      self._data_queue, self._workers_done_event,
+                      self._collate_fn, self._worker_init_fn, i,
+                      self._num_workers))
             worker.daemon = True
             worker.start()
             self._workers.append(worker)
@@ -307,7 +424,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             core.Variable(), self._outstanding_capacity, len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_buffer_reader, True)
+            self._need_check_feed, self._places, self._use_buffer_reader, True,
+            self._pin_memory)
 
         self._thread_done_event = threading.Event()
         self._thread = threading.Thread(target=self._thread_loop)
@@ -350,8 +468,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
         self._blocking_queue.kill()
         logging.error("DataLoader reader thread raised an exception!")
 
-    def _worker_loop(self, dataset, indices_queue, out_queue, done_event,
-                     collate_fn, init_fn, worker_id):
+    def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue,
+                     done_event, collate_fn, init_fn, worker_id, num_workers):
         try:
             # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
             # some shared memory objects may have been applied for but have not yet
@@ -362,14 +480,21 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             # set signal handler
             core._set_process_signal_handler()
 
+            global _worker_info
+            _worker_info = WorkerInfo(
+                id=worker_id, num_workers=num_workers, dataset=dataset)
+
             init_exception = None
-            if init_fn is not None:
-                try:
+            try:
+                if init_fn is not None:
                     init_fn(worker_id)
-                except:
-                    init_exception = Exception("init_fn failed in worker {}: " \
-                                         "{}".format(worker_id, sys.exc_info()))
+                fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
+                                                      collate_fn, True)
+            except:
+                init_exception = Exception("init_fn failed in worker {}: " \
+                                     "{}".format(worker_id, sys.exc_info()))
 
+            iterator_drained = False
             parent_watch_dog = ParentWatchDog()
 
             while parent_watch_dog.is_alive():
@@ -380,12 +505,12 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
 
                 # None as poison piil, so worker event should be set
                 if data is None:
-                    assert done_event.is_set(
-                    ), "get None when worker done_event set"
+                    assert done_event.is_set() or iterator_drained, \
+                            "get None when worker done_event set"
                     break
                 # If worker done event is set but get still get data in
                 # indices_queue, remaining data should be get and skipped.
-                if done_event.is_set():
+                if done_event.is_set() or iterator_drained:
                     continue
 
                 idx, indices = data
@@ -394,11 +519,15 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                         batch = init_exception
                         init_exception = None
                     else:
-                        batch = [dataset[i] for i in indices]
-                        if self._collate_fn is not None:
-                            batch = self._collate_fn(batch)
+                        batch = fetcher.fetch(indices)
                 except Exception as e:
-                    out_queue.put((idx, e))
+                    if isinstance(
+                            e,
+                            StopIteration) and dataset_kind == _DatasetKind.ITER:
+                        out_queue.put(_IterableDatasetStopIteration(worker_id))
+                        iterator_drained = True
+                    else:
+                        out_queue.put((idx, e))
                 else:
                     if self._use_shared_memory:
                         tensor_list = core._convert_to_tensor_list(batch)
@@ -435,7 +564,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                             # serializable, cannot be create in workers
                             for slot in batch:
                                 if not isinstance(slot, core.LoDTensor):
-                                    # self._check_input_array(slot)
                                     tmp = core.LoDTensor()
                                     tmp.set(slot, core.CPUPlace())
                                     slot = tmp
@@ -450,10 +578,31 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                         self._rcvd_idx += 1
 
     def _get_data(self):
-        if self._rcvd_idx in self._reorder_dict.keys():
-            return self._reorder_dict.pop(self._rcvd_idx)
-
         while not self._thread_done_event.is_set():
+            # For IterableDataset, batch indices is generated infinitely
+            # for each worker to raise StopIteration, but a StopIteration
+            # raising process will discard a batch indices which is count
+            # in _send_idx but will not increase _rcvd_idx, so we check 
+            # whether the worker is still alive here to skip the discarded
+            # batch indices and increase _rcvd_idx
+            while self._rcvd_idx < self._send_idx:
+                info = self._task_infos[self._rcvd_idx]
+                if len(info) == 2 or self._worker_status[info[0]]:
+                    break
+                del self._task_infos[self._rcvd_idx]
+                self._rcvd_idx += 1
+                self._batches_outstanding -= 1
+            else:
+                # NOTE: _rcvd_idx and _send_idx only record batches among
+                #       workers, if batches among workers drained, there
+                #       may also be data in blocking queue
+                if self._batches_outstanding < len(self._places):
+                    return None
+                continue
+
+            if len(self._task_infos[self._rcvd_idx]) == 2:
+                return self._task_infos.pop(self._rcvd_idx)[1]
+
             try:
                 # [ avoid hang ]: main process may blocking at _reader.read_next when
                 # KeyboardInterrupt, we do following tradeoff:
@@ -491,25 +640,55 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                               "workers' result queue.".format(e))
                 six.reraise(*sys.exc_info())
             else:
+                if self._dataset_kind == _DatasetKind.ITER and isinstance(
+                        data, _IterableDatasetStopIteration):
+                    # if a worker get StopIteraion, we shutdown this worker,
+                    # note that this batch indices to trigger StopIteration
+                    # is discard, outstanding batch number should be decrease
+                    # and another indices should be put for other workers
+                    # may still working.
+                    self._shutdown_worker(data.worker_id)
+                    self._batches_outstanding -= 1
+                    self._try_put_indices()
+                    continue
+
                 idx, batch = data
                 if idx == self._rcvd_idx:
+                    del self._task_infos[idx]
                     return batch
                 else:
-                    self._reorder_dict[idx] = batch
+                    self._task_infos[idx] += (batch, )
                     continue
 
     def _try_put_indices(self):
-        assert self._send_idx - self._rcvd_idx <= self._outstanding_capacity, \
+        assert self._batches_outstanding <= self._outstanding_capacity, \
                     "too many indices have been put to queue"
-        try:
-            indices = next(self._sampler_iter)
-        except StopIteration:
-            return
+        # In multi-process mode for IterableDataset, _try_put_indices will
+        # be called both in main process(for our implement has blocking queue,
+        # and blocking queue read is in main process) and thread, which may
+        # cause error following error
+        #   1. "ValueError: generator already executing" in next(self._sampler_iter)
+        #   2. re-enter in increase _send_idx
+        # add a lock for threading save, for _try_put_indices is only a slight
+        # function which is not in data reading pipeline, this lock almost no
+        # influence on performance
+        with self._thread_lock:
+            try:
+                indices = next(self._sampler_iter)
+            except StopIteration:
+                return
+
+            for i in range(self._num_workers):
+                worker_idx = next(self._workers_idx_cycle)
+                if self._worker_status[worker_idx]:
+                    break
+            else:
+                return
 
-        worker_idx = next(self._workers_idx_cycle)
-        self._indices_queues[worker_idx].put((self._send_idx, indices))
-        self._batches_outstanding += 1
-        self._send_idx += 1
+            self._indices_queues[worker_idx].put((self._send_idx, indices))
+            self._task_infos[self._send_idx] = (worker_idx, )
+            self._batches_outstanding += 1
+            self._send_idx += 1
 
     def __del__(self):
         self._try_shutdown_all()
diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py
index b49ceaddefdef272d34ea9008d57d3f1ced43205..e47f57381c0decaf365df58e96d6386a01f9ef1e 100644
--- a/python/paddle/fluid/dataloader/dataset.py
+++ b/python/paddle/fluid/dataloader/dataset.py
@@ -16,12 +16,12 @@ from __future__ import print_function
 
 import paddle.dataset.common
 
-__all__ = ["Dataset"]
+__all__ = ["Dataset", "IterableDataset"]
 
 
 class Dataset(object):
     """
-    An abstract class to encapsulates methods and behaviors of datasets.
+    An abstract class to encapsulate methods and behaviors of datasets.
 
     All datasets in map-style(dataset samples can be get by a given key)
     should be a subclass of `paddle.io.Dataset`. All subclasses should
@@ -71,3 +71,154 @@ class Dataset(object):
     def __len__(self):
         raise NotImplementedError("'{}' not implement in class "\
                 "{}".format('__len__', self.__class__.__name__))
+
+
+class IterableDataset(Dataset):
+    """
+    An abstract class to encapsulate methods and behaviors of iterable datasets.
+
+    All datasets in iterable-style (can only get sample one by one sequentially, like
+    a Python iterator) should be a subclass of `paddle.io.IterableDataset`. All subclasses should
+    implement following methods:
+
+    :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :code:`paddle.io.DataLoader`.
+
+    .. note::
+        do not implement :code:`__getitem__` and :code:`__len__` in IterableDataset, should not be called either.
+
+    see :code:`paddle.io.DataLoader`.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import numpy as np
+            from paddle.io import Dataset
+            
+            # define a random dataset
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __iter__(self):
+                    for i in range(self.num_samples):
+                        image = np.random.random([784]).astype('float32')
+                        label = np.random.randint(0, 9, (1, )).astype('int64')
+                        yield image, label
+            
+            dataset = RandomDataset(10)
+            for img, lbl in dataset:
+                print(img, lbl)
+
+    When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
+    will yield whole dataset samples, which means samples in dataset will be repeated in
+    :attr:`num_workers` times. If it is required for each sample to yield only once, there
+    are two methods to configure different copy in each worker process to avoid duplicate data
+    among workers as follows. In both the methods, worker information that can be getted in
+    a worker process by `paddle.io.get_worker_info` will be needed.
+
+    Example 1: splitting data copy in each worker in :code:`__iter__`
+
+        .. code-block:: python
+
+            import math
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.io import IterableDataset, DataLoader, get_worker_info
+
+            class SplitedIterableDataset(IterableDataset):
+                def __init__(self, start, end):
+                    self.start = start
+                    self.end = end
+
+                def __iter__(self):
+                    worker_info = get_worker_info()
+                    if worker_info is None:
+                        iter_start = self.start
+                        iter_end = self.end
+                    else:
+                        per_worker = int(
+                            math.ceil((self.end - self.start) / float(
+                                worker_info.num_workers)))
+                        worker_id = worker_info.id
+                        iter_start = self.start + worker_id * per_worker
+                        iter_end = min(iter_start + per_worker, self.end)
+
+                    for i in range(iter_start, iter_end):
+                        yield np.array([i])
+
+            place = fluid.CPUPlace()
+            with fluid.dygraph.guard(place):
+                dataset = SplitedIterableDataset(start=2, end=9)
+                dataloader = DataLoader(
+                    dataset,
+                    places=place,
+                    num_workers=2,
+                    batch_size=1,
+                    drop_last=True)
+
+                print(list(dataloader))
+                # outputs: [2, 5, 3, 6, 4, 7]
+
+    Example 2: splitting data copy in each worker by :code:`worker_init_fn`
+
+        .. code-block:: python
+
+            import math
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.io import IterableDataset, DataLoader, get_worker_info
+
+            class RangeIterableDataset(IterableDataset):
+                def __init__(self, start, end):
+                    self.start = start
+                    self.end = end
+
+                def __iter__(self):
+                    for i in range(self.start, self.end):
+                        yield np.array([i])
+
+            place = fluid.CPUPlace()
+            with fluid.dygraph.guard(place):
+                dataset = RangeIterableDataset(start=2, end=9)
+
+                def worker_init_fn(worker_id):
+                    worker_info = get_worker_info()
+
+                    dataset = worker_info.dataset
+                    start = dataset.start
+                    end = dataset.end
+                    num_per_worker = int(
+                        math.ceil((end - start) / float(worker_info.num_workers)))
+
+                    worker_id = worker_info.id
+                    dataset.start = start + worker_id * num_per_worker
+                    dataset.end = min(dataset.start + num_per_worker, end)
+
+                dataloader = DataLoader(
+                    dataset,
+                    places=place,
+                    num_workers=2,
+                    batch_size=1,
+                    drop_last=True,
+                    worker_init_fn=worker_init_fn)
+
+                print(list(dataloader))
+                # outputs: [2, 5, 3, 6, 4, 7]
+
+    """
+
+    def __init__(self):
+        pass
+
+    def __iter__(self):
+        raise NotImplementedError("'{}' not implement in class "\
+                "{}".format('__iter__', self.__class__.__name__))
+
+    def __getitem__(self, idx):
+        raise RuntimeError("'{}' should not be called for IterableDataset" \
+                "{}".format('__getitem__', self.__class__.__name__))
+
+    def __len__(self):
+        raise RuntimeError("'{}' should not be called for IterableDataset" \
+                "{}".format('__len__', self.__class__.__name__))
diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..001b8b931da233557c5d0b11af283a6e631788ae
--- /dev/null
+++ b/python/paddle/fluid/dataloader/fetcher.py
@@ -0,0 +1,53 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class _DatasetFetcher(object):
+    def __init__(self, dataset, collate_fn, drop_last):
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.drop_last = drop_last
+
+    def fetch(self, batch_indices):
+        raise NotImplementedError("'fetch' not implement for class {}".format(
+            self.__class__.__name__))
+
+
+class _IterableDatasetFetcher(_DatasetFetcher):
+    def __init__(self, dataset, collate_fn, drop_last):
+        super(_IterableDatasetFetcher, self).__init__(dataset, collate_fn,
+                                                      drop_last)
+        self.dataset_iter = iter(dataset)
+
+    def fetch(self, batch_indices):
+        data = []
+        for _ in batch_indices:
+            try:
+                data.append(next(self.dataset_iter))
+            except StopIteration:
+                break
+        if len(data) == 0 or (self.drop_last and
+                              len(data) < len(batch_indices)):
+            raise StopIteration
+
+        return self.collate_fn(data)
+
+
+class _MapDatasetFetcher(_DatasetFetcher):
+    def __init__(self, dataset, collate_fn, drop_last):
+        super(_MapDatasetFetcher, self).__init__(dataset, collate_fn, drop_last)
+
+    def fetch(self, batch_indices):
+        data = [self.dataset[idx] for idx in batch_indices]
+        return self.collate_fn(data)
diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c75fafe8b22380090ba6fb580777cdbe6570ad6
--- /dev/null
+++ b/python/paddle/fluid/dataloader/sampler.py
@@ -0,0 +1,236 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import numpy as np
+
+__all__ = ["Sampler", "SequenceSampler", "RandomSampler"]
+
+
+class Sampler(object):
+    """
+    An abstract class to encapsulate methods and behaviors of samplers.
+
+    All sampler used by :code:`paddle.io.BatchSampler` should be a subclass
+    of :code:`paddle.io.Sampler`, BatchSampler subclasses should
+    implement following methods:
+
+    :code:`__iter__`: return sample index iterably, which iterate over indices
+    of dataset elements
+
+    :code:`__len__`: the number of sample in :attr:`data_source`
+
+
+    Args:
+        data_source(Dataset, optional): this could be an instance of
+                :code:`paddle.io.Dataset` other Python object which
+                implemented :code:`__len__` for Sampler to get indices
+                as the range of :attr:`dataset` length. Default None.
+
+    Returns:
+        Sampler: an iterable object for sample indices iterating
+
+    Examples:
+        
+        .. code-block:: python
+            
+            from paddle.io import Dataset, Sampler
+
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __getitem__(self, idx):
+                    image = np.random.random([784]).astype('float32')
+                    label = np.random.randint(0, 9, (1, )).astype('int64')
+                    return image, label
+                
+                def __len__(self):
+                    return self.num_samples
+
+            class MySampler(Sampler):
+                def __init__(self, data_source):
+                    self.data_source = data_source
+
+                def __iter__(self):
+                    return iter(range(len(self.data_source)))
+
+                def __len__(self):
+                    return len(self.data_source)
+            
+            sampler = MySampler(data_source=RandomDataset(100))
+
+            for index in sampler:
+                print(index)
+
+    see `paddle.io.BatchSampler`
+    see `paddle.io.DataLoader`
+
+    """
+
+    def __init__(self, data_source=None):
+        self.data_source = data_source
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    # Not define __len__ method in this base class here for __len__
+    # is not needed in same sence, e.g. paddle.io.IterableDataset
+
+
+class SequenceSampler(Sampler):
+    """
+    Iterate samples sequentially, yield :code:`0, 1, 2, ..., len(data_source) -1`
+    generally,
+
+    Args:
+        data_source(Dataset): dataset to sample, this could be an
+                instance of :code:`paddle.io.Dataset` other Python
+                object which implemented :code:`__len__`.
+
+    Returns:
+        Sampler: a Sampler yield sample index sequentially
+
+    Examples:
+
+        .. code-block:: python
+            
+            from paddle.io import Dataset, SequenceSampler
+
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __getitem__(self, idx):
+                    image = np.random.random([784]).astype('float32')
+                    label = np.random.randint(0, 9, (1, )).astype('int64')
+                    return image, label
+                
+                def __len__(self):
+                    return self.num_samples
+
+            sampler = SequenceSampler(data_source=RandomDataset(100))
+
+            for index in sampler:
+                print(index)
+
+    see `paddle.io.Sampler`
+    """
+
+    def __init__(self, data_source):
+        self.data_source = data_source
+
+    def __iter__(self):
+        return iter(range(len(self.data_source)))
+
+    def __len__(self):
+        return len(self.data_source)
+
+
+class RandomSampler(Sampler):
+    """
+    Iterate samples randomly, yield shuffled indices, if :attr:`replacement=False`,
+    yield shuffled indices of the whole data souce, if :attr:`replacement=True`,
+    :attr:`num_samples` can set to specify the sample number to draw.
+
+    Args:
+        data_source(Dataset): dataset to sample, this could be an
+                instance of :code:`paddle.io.Dataset` other Python
+                object which implemented :code:`__len__`.
+        replacement(bool): If False, sample the whole dataset, If False,
+                set :attr:`num_samples` for how many sample to draw. Default False.
+        num_samples(int): set sample number to draw if :attr:`replacement`
+                is True. Default None.
+        generator(Generator): specify a generator to sample the data source. Default None
+        
+    Returns:
+        Sampler: a Sampler yield sample index randomly
+
+    Examples:
+
+        .. code-block:: python
+            
+            from paddle.io import Dataset, RandomSampler
+
+            class RandomDataset(Dataset):
+                def __init__(self, num_samples):
+                    self.num_samples = num_samples
+            
+                def __getitem__(self, idx):
+                    image = np.random.random([784]).astype('float32')
+                    label = np.random.randint(0, 9, (1, )).astype('int64')
+                    return image, label
+                
+                def __len__(self):
+                    return self.num_samples
+
+            sampler = RandomSampler(data_source=RandomDataset(100))
+
+            for index in sampler:
+                print(index)
+
+    see `paddle.io.Sampler`
+    """
+
+    def __init__(self,
+                 data_source,
+                 replacement=False,
+                 num_samples=None,
+                 generator=None):
+        self.data_source = data_source
+        self.replacement = replacement
+        self._num_samples = num_samples
+        self.generator = generator
+
+        if not isinstance(self.replacement, bool):
+            raise TypeError("expect boolean value for replacement, but got "
+                            "replacement={}".format(self.replacement))
+
+        if self._num_samples is not None and not replacement:
+            raise ValueError(
+                "num_samples should not be specified while replacement is False")
+
+        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
+            raise ValueError("num_samples should be a positive integer, "
+                             "but got num_samples={}".format(self.num_samples))
+
+    @property
+    def num_samples(self):
+        if self._num_samples is None:
+            return len(self.data_source)
+        return self._num_samples
+
+    def __iter__(self):
+        n = len(self.data_source)
+        if self.generator:
+            for i in range(self.num_samples):
+                try:
+                    index = next(self.generator)
+                except StopIteration:
+                    return
+                yield index
+        else:
+            if self.replacement:
+                for index in np.random.choice(
+                        np.arange(n), self.num_samples, replace=True).tolist():
+                    yield index
+            else:
+                for index in np.random.choice(
+                        np.arange(n), n, replace=False).tolist():
+                    yield index
+
+    def __len__(self):
+        return self.num_samples
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index 3831dee2964992f1cc035502cef12ac4967e0a72..4796cd5ada420567fa126154cc1ac28badc0f2c0 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -223,7 +223,8 @@ class DownpourSGD(DeviceWorker):
                     dense_table_set.add(i)
                 break
 
-        trainer_desc.device_worker_name = "DownpourWorker"
+        trainer_desc.device_worker_name = opt_info.get("worker_class",
+                                                       "DownpourWorker")
         pull_thread = trainer_desc.pull_dense_param
         pull_thread.device_num = trainer_desc.thread_num
         if opt_info.get("program_id_to_worker") is None:
diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
index f990d02342be78fe998cebfa40ed8b348cf54b2a..fc14e9b390e6ae4d695252f064f1f0697aaee258 100644
--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -56,6 +56,11 @@ from .dygraph_to_static import ProgramTranslator
 from . import rnn
 from .rnn import *
 
+from . import amp
+from .amp import *
+
+from .math_op_patch import monkey_patch_math_varbase
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
@@ -69,3 +74,4 @@ __all__ += jit.__all__
 __all__ += io.__all__
 __all__ += rnn.__all__
 __all__ += ['ProgramTranslator']
+__all__ += amp.__all__
diff --git a/python/paddle/imperative/jit/__init__.py b/python/paddle/fluid/dygraph/amp/__init__.py
similarity index 77%
rename from python/paddle/imperative/jit/__init__.py
rename to python/paddle/fluid/dygraph/amp/__init__.py
index 85fccf6e689ebf606092df8c3f94f561a68705ed..e86c5a20c5a411fda2a0011f63f4b5254e9bd07a 100644
--- a/python/paddle/imperative/jit/__init__.py
+++ b/python/paddle/fluid/dygraph/amp/__init__.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...fluid.dygraph.jit import save, load, SaveLoadConfig
-from ...fluid.dygraph.io import TranslatedLayer
+from . import auto_cast
+from .auto_cast import *
 
-__all__ = ['save', 'load', 'SaveLoadConfig']
+from . import loss_scaler
+from .loss_scaler import *
+
+__all__ = []
+__all__ += auto_cast.__all__
+__all__ += loss_scaler.__all__
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffb4d9f16f29f384b83f175ddcb60f65e8077930
--- /dev/null
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -0,0 +1,166 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from paddle.fluid.wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+from paddle.fluid import core
+import contextlib
+from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags
+import warnings
+import copy
+
+__all__ = ['amp_guard']
+
+# The set of ops that support fp16 calculation and are considered numerically-
+# safe and performance-critical. These ops are always converted to fp16.
+WHITE_LIST = {
+    'conv2d',
+    'matmul',
+    'mul',
+}
+
+# The set of ops that support fp16 calculation and are considered numerically-
+# dangerous and whose effects may also be observed in downstream ops.
+BLACK_LIST = {
+    'exp',
+    'square',
+    'log',
+    'mean',
+    'sum',
+    'cos_sim',
+    'softmax',
+    'softmax_with_cross_entropy',
+    'sigmoid_cross_entropy_with_logits',
+    'cross_entropy',
+    'cross_entropy2',
+}
+
+AMP_RELATED_FLAGS = [
+    'FLAGS_cudnn_exhaustive_search',
+    'FLAGS_conv_workspace_size_limit',
+    'FLAGS_cudnn_batchnorm_spatial_persistent',
+]
+
+AMP_RELATED_FLAGS_SETTING = {
+    'FLAGS_cudnn_exhaustive_search': 1,
+    'FLAGS_conv_workspace_size_limit': 1000,
+    'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
+}
+
+
+#NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list
+# The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode.
+def _update_list(custom_white_list, custom_black_list):
+    """
+    Update black and white list according to users' custom list.
+    """
+    _white_list = copy.copy(WHITE_LIST)
+    _black_list = copy.copy(BLACK_LIST)
+    if custom_white_list and custom_black_list:
+        for op_name in custom_white_list:
+            if op_name in custom_black_list:
+                raise ValueError("Custom white list overlap "
+                                 "custom black list")
+    if custom_white_list:
+        for op_name in custom_white_list:
+            if op_name in _black_list:
+                _black_list.remove(op_name)
+            _white_list.add(op_name)
+    if custom_black_list:
+        for op_name in custom_black_list:
+            if op_name in _white_list:
+                _white_list.remove(op_name)
+            _black_list.add(op_name)
+    return _white_list, _black_list
+
+
+@signature_safe_contextmanager
+@dygraph_only
+def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
+    """
+    :api_attr: imperative
+
+    Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode.
+    If enabled, the input data type (float32 or float16) of each operator is decided 
+    by autocast algorithm for better performance. 
+    
+    Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in 
+    imperative mode.
+
+    Args:
+        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
+        custom_white_list(set|list, optional): The custom white_list.
+        custom_black_list(set|list, optional): The custom black_list.
+        
+    Examples:
+
+     .. code-block:: python
+
+        import numpy as np
+        import paddle.fluid as fluid
+
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            conv2d = fluid.dygraph.Conv2D(3, 2, 3)
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard():
+                conv = conv2d(data)
+                print(conv.dtype) # FP16
+            with fluid.dygraph.amp_guard(enable=False):
+                conv = conv2d(data)
+                print(conv.dtype) # FP32
+
+    """
+    tracer = _dygraph_tracer()
+    if not tracer:
+        raise ValueError(
+            "current_tracer is None, maybe it is not in imperative mode.")
+
+    if enable and not tracer._expected_place.is_gpu_place():
+        warnings.warn(
+            'amp_guard can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
+            % tracer._expected_place)
+        enable = False
+
+    # use default white_list and black_list if no custom lists provided
+    _white_list = WHITE_LIST
+    _black_list = BLACK_LIST
+    if custom_white_list or custom_black_list:
+        _white_list, _black_list = _update_list(custom_white_list,
+                                                custom_black_list)
+
+    if tracer:
+        # enable auto_cast
+        original_enable = tracer._enable_autocast
+        tracer._enable_autocast = enable
+        # set amp op list
+        original_white_list, original_black_list = tracer._get_amp_op_list()
+        tracer._set_amp_op_list(_white_list, _black_list)
+
+        # TODO(zhiqiu) set amp related flags automatically in this guard
+        # Currently, if FLAGS_cudnn_batchnorm_spatial_persistent is set True in amp_guard,
+        # batch_norm can run in fast mode, but batch_norm_grad can not if backward if not executed insise amp_guard.
+        # So, users need to set related flags manually.
+
+        # original_flags = get_flags(AMP_RELATED_FLAGS)
+        # set_flags(AMP_RELATED_FLAGS_SETTING)
+
+    # restore status
+    try:
+        yield
+    finally:
+        if tracer:
+            tracer._enable_autocast = original_enable
+            tracer._set_amp_op_list(original_white_list, original_black_list)
+            # set_flags(original_flags)
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f3ca9ec007ef5c1ab8769dde741a5d2b3697600
--- /dev/null
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -0,0 +1,246 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from paddle.fluid import core
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.framework import _varbase_creator, _dygraph_tracer, dygraph_only
+from paddle.fluid.data_feeder import check_type
+from ...wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+import warnings
+import numpy as np
+
+__all__ = ['AmpScaler']
+
+
+class AmpScaler(object):
+    """
+    :api_attr: imperative
+
+    AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative
+    mode. It controls the scaling of loss, helps avoiding numerical overflow.
+    The object of this class has two methods `scale()`, `minimize()`.
+
+    `scale()` is used to multiply the loss by a scale ratio.
+    `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating.
+
+    Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in 
+    imperative mode.
+
+    Args:
+        enable(bool, optional): Enable loss scaling or not. Default is True.
+        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
+        incr_ratio(float, optional): The multiplier to use when increasing the loss 
+                        scaling. Default is 2.0.
+        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing 
+                        the loss scaling. Default is 0.5.
+        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive 
+                                steps with finite gradients. Default is 1000.
+        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n 
+                                    accumulated steps with nan or inf gradients. Default is 2.
+        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True.
+    Returns:
+        An AmpScaler object.
+
+    Examples:
+
+     .. code-block:: python
+
+        import numpy as np
+        import paddle.fluid as fluid
+
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            model = fluid.dygraph.Conv2D(3, 2, 3)
+            optimizer = fluid.optimizer.SGDOptimizer(
+                    learning_rate=0.01, parameter_list=model.parameters())
+            scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard():
+                conv = model(data)
+                loss = fluid.layers.reduce_mean(conv)
+                scaled = scaler.scale(loss)
+                scaled.backward()
+                scaler.minimize(optimizer, scaled)         
+    """
+
+    @dygraph_only
+    def __init__(self,
+                 enable=True,
+                 init_loss_scaling=2.**15,
+                 incr_ratio=2.0,
+                 decr_ratio=0.5,
+                 incr_every_n_steps=1000,
+                 decr_every_n_nan_or_inf=1,
+                 use_dynamic_loss_scaling=True):
+
+        tracer = _dygraph_tracer()
+        if not tracer:
+            raise ValueError(
+                "current_tracer is None, maybe it is not in imperative mode.")
+
+        if enable and not tracer._expected_place.is_gpu_place():
+            warnings.warn(
+                'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
+                % tracer._expected_place)
+            enable = False
+
+        self._enable = enable
+
+        if self._enable:
+            assert incr_ratio > 1.0, "The incr_ratio must be > 1.0."
+            assert decr_ratio < 1.0, "The decr_ratio must be < 1.0."
+
+            self._init_loss_scaling = init_loss_scaling
+            self._incr_ratio = incr_ratio
+            self._decr_ratio = decr_ratio
+            self._incr_every_n_steps = incr_every_n_steps
+            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
+            self._incr_count = 0
+            self._decr_count = 0
+            self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
+
+            self._found_inf = to_variable(np.array([0]).astype(np.bool))
+            self._scale = to_variable(
+                np.array([self._init_loss_scaling]).astype(np.float32))
+            self._cache_founf_inf = None
+
+    def scale(self, var):
+        """
+        Multiplies a variable(Tensor) by the scale factor and returns scaled outputs.  
+        If this instance of :class:`AmpScaler` is not enabled, output are returned unmodified.
+
+        Args:
+            var (Variable):  The variable to scale.
+        Returns:
+            The scaled variable or original variable.
+        
+        Examples:
+            .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+
+            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+            with fluid.dygraph.guard():
+                model = fluid.dygraph.Conv2D(3, 2, 3)
+                optimizer = fluid.optimizer.SGDOptimizer(
+                        learning_rate=0.01, parameter_list=model.parameters())
+                scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+                data = fluid.dygraph.to_variable(data)
+                with fluid.dygraph.amp_guard():
+                    conv = model(data)
+                    loss = fluid.layers.reduce_mean(conv)
+                    scaled = scaler.scale(loss)
+                    scaled.backward()
+                    scaler.minimize(optimizer, scaled) 
+        """
+        check_type(var, "var", core.VarBase, 'AmpScaler.scale()')
+
+        if not self._enable:
+            return var
+
+        return var * self._scale
+
+    def minimize(self, optimizer, *args, **kwargs):
+        """
+        This function is similar as `Optimizer.minimize()`, which performs parameters updating.
+        
+        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
+        Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
+
+        Finally, the loss scaling ratio is updated.
+
+        Args:
+            optimizer(Optimizer):  The optimizer used to update parameters.
+            args:  Arguments, which will be forward to `optimizer.minimize()`.
+            kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`.
+
+        Examples:
+            .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+
+            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+            with fluid.dygraph.guard():
+                model = fluid.dygraph.Conv2D(3, 2, 3)
+                optimizer = fluid.optimizer.SGDOptimizer(
+                        learning_rate=0.01, parameter_list=model.parameters())
+                scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+                data = fluid.dygraph.to_variable(data)
+                with fluid.dygraph.amp_guard():
+                    conv = model(data)
+                    loss = fluid.layers.reduce_mean(conv)
+                    scaled = scaler.scale(loss)
+                    scaled.backward()
+                    scaler.minimize(optimizer, scaled) 
+        """
+        if not self._enable:
+            return optimizer.minimize(*args, **kwargs)
+
+        #  unscale the grad
+        self._unscale(optimizer)
+
+        optimize_ops, params_grads = (None, None)
+
+        if self._found_inf:
+            self._cache_founf_inf = True
+        else:
+            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+            self._cache_founf_inf = False
+
+        if self._use_dynamic_loss_scaling:
+            # uopdate the scale
+            self._update()
+
+        return optimize_ops, params_grads
+
+    def _unscale(self, optimizer):
+        if not self._enable:
+            return
+        inv_scale = 1.0 / self._scale
+        param_grads = [
+            param._grad_ivar() for param in optimizer._parameter_list
+            if param._grad_ivar() is not None
+        ]
+        core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads,
+                                            self._found_inf)
+
+    def _update(self):
+        """
+        Updates the loss_scaling.
+        """
+        if not self._enable:
+            return
+
+        if self._cache_founf_inf:
+            self._incr_count = 0
+            self._decr_count = self._decr_count + 1
+            if self._decr_count == self._decr_every_n_nan_or_inf:
+                print(
+                    'Found inf or nan, current scale is: {}, decrease to: {}*{}'.
+                    format(
+                        float(self._scale),
+                        float(self._scale), float(self._decr_ratio)))
+                self._scale = self._scale * self._decr_ratio
+                self._decr_count = 0
+        else:
+            self._decr_count = 0
+            self._incr_count = self._incr_count + 1
+            if self._incr_count == self._incr_every_n_steps:
+                self._scale = self._scale * self._incr_ratio
+                self._incr_count = 0
+
+        return
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 7d972cbbd09b95e5d7476837cb3f3318526deed8..d4f1ca333945d8933a7a9df7ca93ea825e5cf110 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+import inspect
 import decorator
 import contextlib
-import functools
 import sys
 import numpy as np
 from paddle.fluid import core
@@ -26,13 +26,8 @@ import objgraph
 from ..data_feeder import convert_dtype
 
 __all__ = [
-    'no_grad',
-    'grad',
-    'guard',
-    'enable_dygraph',
-    'disable_dygraph',
-    'enabled',
-    'to_variable',
+    'no_grad', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', 'enabled',
+    'to_variable'
 ]
 
 
@@ -96,8 +91,8 @@ def enabled():
     """
     This function checks whether the program runs in dynamic graph mode or not.
     You can enter dynamic graph mode with :ref:`api_fluid_dygraph_guard` api,
-    or enable and disable dynamic graph mode with :ref:`api_fluid_dygraph_enable`
-    and :ref:`api_fluid_dygraph_disable` api .
+    or enable and disable dynamic graph mode with :ref:`api_fluid_dygraph_enable_dygraph`
+    and :ref:`api_fluid_dygraph_disable_dygraph` api .
 
     **Note**:
         ``fluid.dygraph.enabled`` is the alias of ``fluid.in_dygraph_mode``, and
@@ -121,10 +116,6 @@ def enabled():
 
 def enable_dygraph(place=None):
     """
-    :alias_main: paddle.enable_dygraph
-	:alias: paddle.enable_dygraph,paddle.enable_imperative.enable_dygraph
-	:old_api: paddle.fluid.dygraph.base.enable_dygraph
-
     This function enables dynamic graph mode.
 
     Parameters:
@@ -155,10 +146,6 @@ def enable_dygraph(place=None):
 
 def disable_dygraph():
     """
-    :alias_main: paddle.disable_dygraph
-	:alias: paddle.disable_dygraph,paddle.disable_imperative.disable_dygraph
-	:old_api: paddle.fluid.dygraph.base.disable_dygraph
-
     This function disables dynamic graph mode.
 
     return:
@@ -180,77 +167,82 @@ def disable_dygraph():
         _functional_dygraph_context_manager = None
 
 
-@signature_safe_contextmanager
-def _switch_tracer_mode_guard_(is_train=True):
-    tracer = framework._dygraph_tracer()
-    if tracer:
-        mode = tracer._train_mode
-        tracer._train_mode = is_train
-        try:
-            yield
-        finally:
-            tracer._train_mode = mode
-    else:
-        yield
-
-
-def no_grad(func=None):
+class no_grad:
     """
     :api_attr: imperative
 
     Create a context which disables dygraph gradient calculation.
-    In this mode, the result of every computation will have `stop_gradient=True`.
+    In this mode, the result of every computation will have `stop_gradient` set
+    to `True`.
 
-    Also functions as a decorator. (Make sure to instantiate without parenthesis.)
+    Also functions as a decorator. (Make sure to use an instance.)
 
     Examples:
 
      .. code-block:: python
 
         import numpy as np
-        import paddle.fluid as fluid
+        import paddle
+
+        paddle.disable_static()
 
         # use as generator
 
         data = np.array([[2, 3], [4, 5]]).astype('float32')
-        with fluid.dygraph.guard():
-            l0 = fluid.Linear(2, 2)  # l0.weight.gradient() is None
-            l1 = fluid.Linear(2, 2)
-            with fluid.dygraph.no_grad():
-                # l1.weight.stop_gradient is False
-                tmp = l1.weight * 2  # tmp.stop_gradient is True
-            x = fluid.dygraph.to_variable(data)
-            y = l0(x) + tmp
-            o = l1(y)
-            o.backward()
-            print(tmp.gradient() is None)  # True
-            print(l0.weight.gradient() is None)  # False
+        l0 = paddle.nn.Linear(2, 2)  # l0.weight.gradient() is None
+        l1 = paddle.nn.Linear(2, 2)
+        with paddle.no_grad():
+            # l1.weight.stop_gradient is False
+            tmp = l1.weight * 2  # tmp.stop_gradient is True
+        x = paddle.to_tensor(data)
+        y = l0(x) + tmp
+        o = l1(y)
+        o.backward()
+        print(tmp.gradient() is None)  # True
+        print(l0.weight.gradient() is None)  # False
 
         # use as decorator
 
-        @fluid.dygraph.no_grad
+        @paddle.no_grad()
         def test_layer():
-            with fluid.dygraph.guard():
-                inp = np.ones([3, 1024], dtype='float32')
-                t = fluid.dygraph.base.to_variable(inp)
-                linear1 = fluid.Linear(1024, 4, bias_attr=False)
-                linear2 = fluid.Linear(4, 4)
-                ret = linear1(t)
-                dy_ret = linear2(ret)
+            inp = np.ones([3, 1024], dtype='float32')
+            t = paddle.to_tensor(inp)
+            linear1 = paddle.nn.Linear(1024, 4, bias_attr=False)
+            linear2 = paddle.nn.Linear(4, 4)
+            ret = linear1(t)
+            dy_ret = linear2(ret)
 
         test_layer()
-
     """
-    if func is None:
-        return _switch_tracer_mode_guard_(is_train=False)
-    else:
 
+    def __call__(self, func):
         @decorator.decorator
-        def __impl__(func, *args, **kwargs):
-            with _switch_tracer_mode_guard_(is_train=False):
+        def _decorate_function(func, *args, **kwargs):
+            with self:
                 return func(*args, **kwargs)
 
-        return __impl__(func)
+        @decorator.decorator
+        def _decorate_generator(func, *args, **kwargs):
+            gen = func(*args, **kwargs)
+            with self:
+                for x in gen:
+                    yield x
+
+        if inspect.isgeneratorfunction(func):
+            return _decorate_generator(func)
+        else:
+            return _decorate_function(func)
+
+    def __enter__(self):
+        tracer = framework._dygraph_tracer()
+        if tracer:
+            self.orig = tracer._train_mode
+            tracer._train_mode = False
+
+    def __exit__(self, *args):
+        tracer = framework._dygraph_tracer()
+        if tracer:
+            tracer._train_mode = self.orig
 
 
 @signature_safe_contextmanager
@@ -288,12 +280,11 @@ def guard(place=None):
     tracer = Tracer()
     VarBase = core.VarBase
 
-    if place is None:
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-    tracer._expected_place = place
+    if place is not None:
+        expected_place = place
+    else:
+        expected_place = framework._current_expected_place()
+    tracer._expected_place = expected_place
 
     with framework.program_guard(train, startup):
         with framework.unique_name.guard():
@@ -384,47 +375,46 @@ def grad(outputs,
     Examples 1:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
+            paddle.disable_static()
 
             def test_dygraph_grad(create_graph):
-                with fluid.dygraph.guard(): 
-                    x = fluid.layers.ones(shape=[1], dtype='float32') 
-                    x.stop_gradient = False
-                    y = x * x
-
-                    # Since y = x * x, dx = 2 * x 
-                    dx = fluid.dygraph.grad(
-                            outputs=[y],
-                            inputs=[x], 
-                            create_graph=create_graph, 
-                            retain_graph=True)[0]
-
-                    z = y + dx
-
-                    # If create_graph = False, the gradient of dx
-                    # would not be backpropagated. Therefore,
-                    # z = x * x + dx, and x.gradient() = 2 * x = 2.0
-                    
-                    # If create_graph = True, the gradient of dx
-                    # would be backpropagated. Therefore, 
-                    # z = x * x + dx = x * x + 2 * x, and
-                    # x.gradient() = 2 * x + 2 = 4.0 
-
-                    z.backward()
-                    return x.gradient() 
-
-            print(test_dygraph_grad(create_graph=False)) # [2.] 
+                x = paddle.ones(shape=[1], dtype='float32')
+                x.stop_gradient = False
+                y = x * x
+
+                # Since y = x * x, dx = 2 * x
+                dx = paddle.grad(
+                        outputs=[y],
+                        inputs=[x],
+                        create_graph=create_graph,
+                        retain_graph=True)[0]
+
+                z = y + dx
+
+                # If create_graph = False, the gradient of dx
+                # would not be backpropagated. Therefore,
+                # z = x * x + dx, and x.gradient() = 2 * x = 2.0
+
+                # If create_graph = True, the gradient of dx
+                # would be backpropagated. Therefore,
+                # z = x * x + dx = x * x + 2 * x, and
+                # x.gradient() = 2 * x + 2 = 4.0
+
+                z.backward()
+                return x.gradient()
+
+            print(test_dygraph_grad(create_graph=False)) # [2.]
             print(test_dygraph_grad(create_graph=True)) # [4.]
 
     Examples 2:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-
-            fluid.enable_dygraph()
+            import paddle
+            paddle.disable_static()
 
             def test_dygraph_grad(grad_outputs=None):
-                x = fluid.layers.fill_constant(shape=[1], value=2.0, dtype='float32')
+                x = paddle.fill_constant(shape=[1], value=2.0, dtype='float32')
                 x.stop_gradient = False
 
                 y1 = x * x
@@ -440,27 +430,27 @@ def grad(outputs,
                 # Therefore, the final result would be:
                 # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2.
 
-                dx = fluid.dygraph.grad(
+                dx = paddle.grad(
                     outputs=[y1, y2], 
                     inputs=[x],
                     grad_outputs=grad_outputs)[0]
 
                 return dx.numpy()
 
-            THREE = fluid.layers.fill_constant(shape=[1], value=3.0, dtype='float32')
-            FOUR = fluid.layers.fill_constant(shape=[1], value=4.0, dtype='float32')
+            grad_value = paddle.fill_constant(shape=[1], value=4.0, dtype='float32')
 
             # dy1 = [1], dy2 = [1]
             print(test_dygraph_grad(None)) # [7.]
 
             # dy1 = [1], dy2 = [4]
-            print(test_dygraph_grad([None, FOUR])) # [16.] 
+            print(test_dygraph_grad([None, grad_value])) # [16.]
 
             # dy1 = [4], dy2 = [1]
-            print(test_dygraph_grad([FOUR, None])) # [19.]
+            print(test_dygraph_grad([grad_value, None])) # [19.]
 
             # dy1 = [3], dy2 = [4]
-            print(test_dygraph_grad([THREE, FOUR])) # [24.]
+            grad_y1 = paddle.fill_constant(shape=[1], value=3.0, dtype='float32')
+            print(test_dygraph_grad([grad_y1, grad_value])) # [24.]
 	'''
 
     def check_in_out(in_out_list, name):
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index de4330cf51669ebbbfb1ca7e9edcc0c82b1d0e72..82018132cc8b8600958e5cd52df5844e3d37638e 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -16,12 +16,13 @@ from __future__ import print_function
 
 import os
 import collections
-from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase
+from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer
 import pickle
 import six
 from . import learning_rate_scheduler
 import warnings
 from .. import core
+from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars
 
 __all__ = [
     'save_dygraph',
@@ -140,22 +141,83 @@ def load_dygraph(model_path, keep_name_table=False):
     elif model_prefix.endswith(".pdopt"):
         model_prefix = model_prefix[:-6]
 
-    params_file_path = model_prefix + ".pdparams"
-    if not os.path.exists(params_file_path):
-        raise RuntimeError("Parameter file [ {} ] not exists".format(
-            params_file_path))
-
-    with open(params_file_path, 'rb') as f:
-        para_dict = pickle.load(f) if six.PY2 else pickle.load(
-            f, encoding='latin1')
-
-    if not keep_name_table and "StructuredToParameterName@@" in para_dict:
-        del para_dict["StructuredToParameterName@@"]
+    para_dict = None
     opti_dict = None
+    params_file_path = model_prefix + ".pdparams"
     opti_file_path = model_prefix + ".pdopt"
-    if os.path.exists(opti_file_path):
-        with open(opti_file_path, 'rb') as f:
-            opti_dict = pickle.load(f) if six.PY2 else pickle.load(
-                f, encoding='latin1')
+    if not os.path.exists(params_file_path) and not os.path.exists(
+            opti_file_path):
+        # Load state dict by `jit.save` save format
+        # TODO(chenweihang): [Why not support `io.save_infernece_model` save format here]
+        # The model saved by `save_inference_model` does not completely correspond to 
+        # the information required by the `state_dict` under the dygraph. 
+        # Although we reluctantly restore the `state_dict` in some scenarios, 
+        # this may not be complete and there are some limitations, so this function 
+        # will be considered later. The limitations include:
+        #   1. `save_inference_model` not save structured name, we need to remind 
+        # the user to configure the `use_structured_name` argument when `set_dict`, 
+        # but this argument is currently not public
+        #   2. if `save_inference_model` save all persistable variables in a single file,
+        # user need to give the variable name list to load `state_dict`
+
+        # 1. check model path
+        if not os.path.isdir(model_prefix):
+            raise ValueError("Model saved directory '%s' is not exists." %
+                             model_prefix)
+        # 2. load `__variables.info__`
+        var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
+        if not os.path.exists(var_info_path):
+            raise RuntimeError(
+                "No target can be loaded. Now only supports loading `state_dict` from "
+                "the result saved by `imperative.save` and `imperative.jit.save`."
+            )
+        with open(var_info_path, 'rb') as f:
+            extra_var_info = pickle.load(f)
+        # 3. load `__variables__`
+        # TODO(chenweihang): now only supports loading from default save format:
+        # - all persistable vars saved in one file named `__variables__`
+        # for other case, we may need to modify the arguments of this API
+        var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME)
+        if not os.path.exists(var_file_path):
+            raise RuntimeError(
+                "The parameter file to be loaded was not found. "
+                "Now only supports loading from the default save format, "
+                "and does not support custom params_filename and "
+                "save parameters separately.")
+        # 4. load all persistable vars
+        load_var_list = []
+        for name in sorted(extra_var_info):
+            var = _varbase_creator(name=name, persistable=True)
+            load_var_list.append(var)
+        _dygraph_tracer().trace_op(
+            type='load_combine',
+            inputs={},
+            outputs={'Out': load_var_list},
+            attrs={'file_path': var_file_path})
+        # 5. construct state_dict
+        para_dict = dict()
+        for var in load_var_list:
+            structured_name = extra_var_info[var.name].get('structured_name',
+                                                           None)
+            if structured_name is None:
+                raise RuntimeError(
+                    "Cannot find saved variable (%s)'s structured name in saved model.",
+                    var.name)
+            para_dict[structured_name] = var.numpy()
+        # NOTE: `jit.save` doesn't save optimizer state
+    else:
+        # Load state dict by `save_dygraph` save format
+        if os.path.exists(params_file_path):
+            with open(params_file_path, 'rb') as f:
+                para_dict = pickle.load(f) if six.PY2 else pickle.load(
+                    f, encoding='latin1')
+
+        if not keep_name_table and "StructuredToParameterName@@" in para_dict:
+            del para_dict["StructuredToParameterName@@"]
+
+        if os.path.exists(opti_file_path):
+            with open(opti_file_path, 'rb') as f:
+                opti_dict = pickle.load(f) if six.PY2 else pickle.load(
+                    f, encoding='latin1')
 
     return para_dict, opti_dict
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index f859d40050c73d276bf9940d904c656debd35c82..8297f16f6081829029064600ad617044591bc256 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -38,7 +38,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name
 
 __all__ = ['DygraphToStaticAst']
 
-DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func']
+DECORATOR_NAMES = ['declarative', 'to_static', 'dygraph_to_static_func']
 
 
 class DygraphToStaticAst(gast.NodeTransformer):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aba7ca0fdc0cfda5d79f5a66d78785df49c0baf
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import traceback
+
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginInfo, global_origin_info_map
+
+ERROR_DATA = "Error data about original source code information and traceback."
+
+
+def attach_error_data(error, in_runtime=False):
+    """
+    Attachs error data about original source code information and traceback to an error.
+
+    Args:
+        error(Exception): An native error.
+        in_runtime(bool): `error` is raised in runtime if in_runtime is True, otherwise in compile time
+    Returns:
+        An error attached data about original source code information and traceback.
+    """
+    e_type, e_value, e_traceback = sys.exc_info()
+    tb = traceback.extract_tb(e_traceback)[1:]
+
+    error_data = ErrorData(e_type, e_value, tb, global_origin_info_map)
+    error_data.in_runtime = in_runtime
+
+    setattr(error, ERROR_DATA, error_data)
+
+    remove_static_file()
+    return error
+
+
+def remove_static_file():
+    """
+    Removes temporary files created during the transformation of dygraph to static graph.
+    """
+    del_files = set()
+    for loc in global_origin_info_map:
+        static_filepath = loc[0]
+        del_files.add(static_filepath)
+
+        filename, extension = os.path.splitext(static_filepath)
+        del_files.add(filename + ".pyc")
+
+    for filepath in del_files:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+
+
+class TraceBackFrame(OriginInfo):
+    """
+    Traceback frame information.
+    """
+
+    def __init__(self, location, function_name, source_code):
+        self.location = location
+        self.function_name = function_name
+        self.source_code = source_code
+
+
+class ErrorData(object):
+    """
+    Error data attached to an exception which is raised in un-transformed code.
+    """
+
+    def __init__(self, error_type, error_value, origin_traceback,
+                 origin_info_map):
+        self.error_type = error_type
+        self.error_value = error_value
+        self.origin_traceback = origin_traceback
+        self.origin_info_map = origin_info_map
+        self.in_runtime = False
+
+    def create_exception(self):
+        message = self.create_message()
+        new_exception = self.error_type(message)
+        setattr(new_exception, ERROR_DATA, self)
+        return new_exception
+
+    def create_message(self):
+        """
+        Creates a custom error message which includes trace stack with source code information of dygraph from user.
+        """
+        message_lines = []
+
+        # Step1: Adds header message to prompt users that the following is the original information.
+        header_message = "In user code:"
+        message_lines.append(header_message)
+        message_lines.append("")
+
+        # Simplify error value to improve readability if error is raised in runtime
+        if self.in_runtime:
+            self._simplify_error_value()
+            message_lines.append(str(self.error_value))
+            return '\n'.join(message_lines)
+
+        # Step2: Optimizes stack information with source code information of dygraph from user.
+        for filepath, lineno, funcname, code in self.origin_traceback:
+            loc = Location(filepath, lineno)
+
+            dygraph_func_info = self.origin_info_map.get(loc.line_location,
+                                                         None)
+            if dygraph_func_info:
+                # TODO(liym27): more information to prompt users that this is the original information.
+                # Replaces trace stack information about transformed static code with original dygraph code.
+                traceback_frame = self.origin_info_map[loc.line_location]
+            else:
+                traceback_frame = TraceBackFrame(loc, funcname, code)
+
+            message_lines.append(traceback_frame.formated_message())
+
+        # Step3: Adds error message like "TypeError: dtype must be int32, but received float32".
+        error_message = " " * 4 + traceback.format_exception_only(
+            self.error_type, self.error_value)[0].strip("\n")
+        message_lines.append(error_message)
+
+        return '\n'.join(message_lines)
+
+    def _simplify_error_value(self):
+        """
+        Simplifies error value to improve readability if error is raised in runtime.
+
+        NOTE(liym27): The op callstack information about transformed static code has been replaced with original dygraph code.
+
+        TODO(liym27):
+            1. Need a more robust way because the code of start_trace may change.
+            2. Set the switch to determine whether to simplify error_value
+        """
+        assert self.in_runtime is True
+
+        error_value_lines = str(self.error_value).split("\n")
+        error_value_lines_strip = [mes.lstrip(" ") for mes in error_value_lines]
+
+        start_trace = "outputs = static_func(*inputs)"
+        start_idx = error_value_lines_strip.index(start_trace)
+        error_value_lines = error_value_lines[start_idx + 1:]
+
+        error_value_str = '\n'.join(error_value_lines)
+        self.error_value = self.error_type(error_value_str)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
index 6b9ee9cbbe21b405b93a7ba2e86b39b47225196c..c66778992c25c68a5adf6a17ae0cd46b57b02fd6 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
@@ -39,32 +39,21 @@ GENERATE_VARIABLE_PREFIX = 'generate_variable'
 
 
 def create_while_node(condition_name, body_name, loop_var_names):
-    while_args = []
-    while_args.append(
-        gast.Name(
-            id=condition_name,
-            ctx=gast.Param(),
-            annotation=None,
-            type_comment=None))
-    while_args.append(
-        gast.Name(
-            id=body_name, ctx=gast.Param(), annotation=None, type_comment=None))
-    assign_targets = [
-        gast.Name(
-            id=var_name, ctx=gast.Param(), annotation=None, type_comment=None)
-        for var_name in loop_var_names
-    ]
-    while_args.append(gast.List(elts=assign_targets, ctx=gast.Param()))
-
-    while_func_id = gast.parse(
-        'fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop'
-    ).body[0].value
-    while_node = gast.Call(func=while_func_id, args=while_args, keywords=[])
-    assign_node = gast.Assign(
-        targets=[gast.Tuple(
-            elts=assign_targets, ctx=gast.Store())],
-        value=while_node)
-    return assign_node
+    # NOTE(liym27):
+    # It's better to parse the source code into an AST node than to customize an AST node
+    # including child nodes, because it is easy to mistake the ast node type when customizing the node.
+    #
+    # For example: loop_var_names = [a, b, foo.x], the type of `a` or `b` is gast.Name,
+    # but the type of `foo.x` gast.Attribute.
+
+    while_func_name = "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop"
+    while_node_str = "[{}] = {}({}, {}, [{}])".format(
+        ",".join(loop_var_names), while_func_name, condition_name, body_name,
+        ",".join(loop_var_names))
+
+    while_node = gast.parse(while_node_str).body[0]
+
+    return while_node
 
 
 class NameVisitor(gast.NodeVisitor):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
index 429fa27f618765c78ad8b7e171b5b6341ed7335d..13f38b0726c27566ff0eda41d6c365e6a7e4aa4b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
@@ -18,9 +18,13 @@ import collections
 import inspect
 
 import gast
+from paddle.fluid import core
+from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap
+from paddle.fluid.framework import Program
 
 # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
 ORIGI_INFO = "Original information of source code for ast node."
+ORIGI_INFO_MAP = "Original information map of source code."
 
 
 class Location(object):
@@ -64,6 +68,15 @@ class OriginInfo(object):
         return "{} \nsource_code: {}  in function {}\n  ".format(
             self.location, self.source_code, self.function_name)
 
+    def formated_message(self):
+        return '    File "{}", line {}, in {}\n\t{}'.format(
+            self.location.filepath, self.location.lineno, self.function_name,
+            self.source_code.lstrip())
+
+    def as_frame(self):
+        return (self.location.filepath, self.location.lineno,
+                self.function_name, self.source_code.lstrip())
+
 
 class OriginInfoAttacher(gast.NodeTransformer):
     """
@@ -119,7 +132,12 @@ class OriginInfoAttacher(gast.NodeTransformer):
         return self.col_offset + node.col_offset
 
 
-def create_origin_info_map(transformed_node, static_func):
+global_origin_info_map = {}
+
+
+def create_and_update_origin_info_map(transformed_node,
+                                      static_func,
+                                      is_global=True):
     """
     Creates a original information map between transformed static function and original dygraph function.
 
@@ -156,6 +174,10 @@ def create_origin_info_map(transformed_node, static_func):
 
         origin_info_map[static_loc] = dygraph_info
 
+    global_origin_info_map.update(origin_info_map)
+    if is_global:
+        return global_origin_info_map
+
     return origin_info_map
 
 
@@ -175,18 +197,6 @@ def attach_origin_info(ast_node, func):
     return ast_node
 
 
-# NOTE: inspect.unwrap() exits in PY3 but not in PY2.
-def unwrap(func):
-    def _is_wrapped(f):
-        return hasattr(f, '__wrapped__')
-
-    unwrapped_f = func
-    while (_is_wrapped(unwrapped_f)):
-        unwrapped_f = unwrapped_f.__wrapped__
-
-    return unwrapped_f
-
-
 def ast_walk(transformed_node, static_node):
     """
     Recursively yield all descendant nodes in the trees starting at transformed_node and static_node (including itself) in parallel.
@@ -234,3 +244,63 @@ def ast_walk(transformed_node, static_node):
                     if isinstance(d_item, gast.AST):
                         transformed_node_list.append(d_item)
                         static_node_list.append(s_item)
+
+
+def update_op_callstack_with_origin_info(program):
+    """
+    Replaces op callstack information about transformed static code with original dygraph code.
+    """
+
+    assert isinstance(program, Program)
+
+    def get_new_op_callstack(callstack):
+        """
+        An example of callstack:
+
+            File "path1/to/file.py", line 10, in func_1
+                y = fluid.layers.fill_constant(x, shape=[1], dtype="int32")
+            File "path2/to/file.py", line 740, in fill_constant
+                stop_gradient=True)
+            File "path3/to/file.py", line 43, in append_op
+              return self.main_program.current_block().append_op(*args, **kwargs)
+            File "path4/to/file.py", line 2811, in append_op
+              attrs=kwargs.get("attrs", None))
+            File "path5/to/file.py", line 1919, in __init__
+              for frame in traceback.extract_stack():
+        """
+
+        assert len(callstack) % 2 == 0
+        for i in range(0, len(callstack), 2):
+
+            file_line = callstack[i].lstrip(" ").split(",")
+
+            filepath = file_line[0][6:-1]
+            lineno = int(file_line[1][6:])
+            funcname = file_line[2][4:]
+            code = callstack[i + 1].lstrip(" ")
+
+            loc = Location(filepath, lineno)
+            dygraph_func_info = global_origin_info_map.get(loc.line_location)
+            if dygraph_func_info:
+                filepath, lineno, funcname, code = \
+                    dygraph_func_info.as_frame()
+
+            callstack[i] = '  File "{}", line {}, in {}'.format(
+                filepath, lineno, funcname)
+            callstack[i + 1] = '    {}'.format(code)
+
+        return callstack
+
+    op_maker = core.op_proto_and_checker_maker
+    callstack_var_name = op_maker.kOpCreationCallstackAttrName()
+
+    for block in program.blocks:
+        for i, op in enumerate(block.ops):
+            if op.has_attr(callstack_var_name):
+                callstack = op.attr(callstack_var_name)
+
+                callstack = get_new_op_callstack(callstack)
+
+                op._set_attr(callstack_var_name, callstack)
+
+    return program
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 05fce7bf837664eafd89319eb6cdd973b745605f..7d2a767dd8f86fbf7e0908720d4d8a81a4885685 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -130,8 +130,6 @@ class PartialProgramLayer(layers.Layer):
         self._check_params_all_inited(main_program)
         # 2. Prune the parameters not used anywhere in the program.
         self._prune_unused_params(main_program)
-        # 3. Remove op's python call stack with redundant low-level error messages.
-        main_program = self._remove_op_call_stack(main_program)
 
         return main_program
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
index 1b6b64ae1fdee89b8e7d9bfcb6601d27f76d10a5..d555c8ed28f358a43e53966dd30d76d85a03dde5 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
@@ -47,8 +47,7 @@ class PrintTransformer(gast.NodeTransformer):
     # NOTE: deal with print in PY3
     def visit_Call(self, node):
         if isinstance(node.func, gast.Name) and node.func.id == 'print':
-            convert_print_node = self._create_print_node(node.args)
-            return gast.Expr(value=convert_print_node)
+            node = self._create_print_node(node.args)
         return node
 
     # NOTE: deal with print in PY2
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 79e812ff6192bc09e6e8c71397c6a239011dfae6..ceacba25375c64552e1e85d046ca494b078ee66d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -13,29 +13,38 @@
 # limitations under the License.
 
 from __future__ import print_function
-import gast
+
+import collections
 import inspect
-import warnings
 import textwrap
 import threading
-import collections
+import warnings
+
+import gast
 import numpy as np
-from paddle.fluid import core, scope_guard
-from paddle.fluid import framework
+from paddle.fluid import core
 from paddle.fluid import executor
+from paddle.fluid import framework
+from paddle.fluid import scope_guard
 from paddle.fluid import unique_name
+from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph import layers
-from paddle.fluid.layers.utils import flatten
-from paddle.fluid.layers.utils import pack_sequence_as
+from paddle.fluid.dygraph.base import param_guard
 from paddle.fluid.dygraph.base import switch_to_static_graph
 from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst
+from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
+from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
+from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func
 from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
-from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func
+from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap
+from paddle.fluid.layers.utils import flatten
+from paddle.fluid.layers.utils import pack_sequence_as
 from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
-from paddle.fluid.dygraph.base import param_guard
-from paddle.fluid.data_feeder import check_type
-from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
 
 __all__ = ['ProgramTranslator', 'convert_to_static']
 
@@ -86,17 +95,25 @@ class FunctionCache(object):
         """
         # Note: In Python2, it will raise OSError when inspect function
         # with decorator directly and function.__wrapped__ holds the actual function.
-        func = getattr(func, '__wrapped__', func)
+        func = unwrap(func)
         source_code = func_to_source_code(func)
+
+        # TODO(liym27):
+        #  Consider this case: source_code in self._code_to_ast_caches,
+        #  but actually they are methods in different classes.
+        #  Maybe use (__class__, source_code) as key
         if source_code in self._code_to_ast_caches:
             root_wrapper = self._code_to_ast_caches[source_code]
         else:
             root = gast.parse(source_code)
+            root = attach_origin_info(root, func)
             root_wrapper = self._dygraph_to_static.get_static_ast(root)
             self._code_to_ast_caches[source_code] = root_wrapper
 
         # Get static function from AST
         static_func, file_name = ast_to_func(root_wrapper.node, func)
+
+        create_and_update_origin_info_map(root_wrapper.node, static_func)
         return static_func
 
     def exist(self, func):
@@ -125,6 +142,7 @@ class FunctionSpec(object):
         self._args = args
         self._kwargs = kwargs
 
+        # TODO(liym27): func has multi layer decorator
         dyfunc = getattr(func, '__wrapped__', func)
         self._dyfunc_code = inspect.getsource(dyfunc)
 
@@ -282,11 +300,19 @@ class ConcreteProgram(object):
                 # 3. Builds program only once and returns the output Variables.
                 with param_guard(func_spec.parameters(False)), param_guard(
                         func_spec.buffers(False)):
-                    outputs = static_func(*inputs)
+                    try:
+                        outputs = static_func(*inputs)
+                    except BaseException as e:
+                        # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here.
+                        attach_error_data(e)
+                        raise
+
                 if not isinstance(outputs,
                                   (tuple, list)) and outputs is not None:
                     outputs = [outputs]
 
+        main_program = update_op_callstack_with_origin_info(main_program)
+
         return ConcreteProgram(
             inputs=inputs,
             outputs=outputs,
@@ -483,14 +509,24 @@ class ProgramTranslator(object):
             return dygraph_func(*args, **kwargs)
 
         function_spec = FunctionSpec(dygraph_func, args, kwargs)
-        _, partial_program_layer = self._program_cache[function_spec]
+        concrete_program, partial_program_layer = self._program_cache[
+            function_spec]
 
         if args and isinstance(args[0], layers.Layer):
             # Synchronize self.training attribute.
             partial_program_layer.training = args[0].training
             args = args[1:]
-
-        return partial_program_layer(args)
+        try:
+            return partial_program_layer(args)
+
+        except BaseException as e:
+            # NOTE:
+            # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before;
+            # 2. If e raised in runtime, e should be attached to ERROR_DATA here.
+            if not hasattr(e, ERROR_DATA):
+                # runtime error
+                attach_error_data(e, in_runtime=True)
+            raise
 
     def get_func(self, dygraph_func):
         """
@@ -639,7 +675,9 @@ class ProgramTranslator(object):
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_code"
         # Gets AST from dygraph function
-        raw_code = inspect.getsource(dygraph_func)
+
+        unwrap_func = unwrap(dygraph_func)
+        raw_code = inspect.getsource(unwrap_func)
         code = textwrap.dedent(raw_code)
         root = gast.parse(code)
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index bb5b2843c92e2b1ed88b002bb1511c07ddd61f37..21e05bc6faf10110fae385b525e72e38a04da925 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -19,7 +19,6 @@ import astor
 import atexit
 import copy
 import gast
-import imp
 import inspect
 import os
 import six
@@ -28,6 +27,12 @@ import textwrap
 
 from paddle.fluid import unique_name
 
+# imp is deprecated in python3
+if six.PY2:
+    import imp
+else:
+    from importlib.machinery import SourceFileLoader
+
 dygraph_class_to_static_api = {
     "CosineDecay": "cosine_decay",
     "ExponentialDecay": "exponential_decay",
@@ -368,9 +373,15 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True):
     TODO: If only decorate one of inner function instead of decorating the main
     function, the other inner functions are invisible for the decorated function.
     """
+
+    def remove_if_exit(filepath):
+        if os.path.exists(filepath):
+            os.remove(filepath)
+
     source = ast_to_source_code(ast_root)
     import_fluid = "import paddle.fluid as fluid\n"
     source = import_fluid + source
+
     if six.PY2:
         source = source.encode('utf-8')
         f = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False)
@@ -382,8 +393,13 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True):
         f.write(source)
 
     if delete_on_exit:
-        atexit.register(lambda: os.remove(f.name))
-    module = imp.load_source(module_name, f.name)
+        atexit.register(lambda: remove_if_exit(f.name))
+        atexit.register(lambda: remove_if_exit(f.name[:-3] + ".pyc"))
+
+    if six.PY2:
+        module = imp.load_source(module_name, f.name)
+    else:
+        module = SourceFileLoader(module_name, f.name).load_module()
     func_name = dyfunc.__name__
     if not hasattr(module, func_name):
         raise ValueError(
@@ -1045,3 +1061,19 @@ class SplitAssignTransformer(gast.NodeTransformer):
             value_node = target
 
         return new_nodes
+
+
+# NOTE: inspect.unwrap() exits in PY3 but not in PY2.
+def unwrap(func):
+    """
+    Returns the object wrapped by decorators.
+    """
+
+    def _is_wrapped(f):
+        return hasattr(f, '__wrapped__')
+
+    unwrapped_f = func
+    while (_is_wrapped(unwrapped_f)):
+        unwrapped_f = unwrapped_f.__wrapped__
+
+    return unwrapped_f
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index 38e4e517836ed8ddbeb36fb68a0c34fa9826f233..ba27b2d1c631428c07a5832125cdc18634e9a4b5 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -23,6 +23,7 @@ from paddle import compat as cpt
 from paddle.fluid import core
 from paddle.fluid import framework
 from paddle.fluid import backward
+from paddle.fluid import unique_name
 from paddle.fluid.dygraph import layers
 from paddle.fluid.layers import nn
 from paddle.fluid.dygraph.base import switch_to_static_graph
@@ -31,6 +32,9 @@ __all__ = ['TranslatedLayer']
 
 VARIABLE_FILENAME = "__variables__"
 EXTRA_VAR_INFO_FILENAME = "__variables.info__"
+LOADED_VAR_SUFFIX = "load"
+PARAMETER_NAME_PREFIX = "param"
+BUFFER_NAME_PREFIX = "buffer"
 
 
 def _load_program_desc(model_file_path):
@@ -107,33 +111,30 @@ def _get_all_var_names(program_desc):
     return all_var_names
 
 
+@switch_to_static_graph
 def _append_loaded_suffix(name):
     """
     Append loaded suffix to the given variable name
-    e.g. x ==> x@LOADED
+    e.g. x ==> x.load_0, x.load_0 ==> x.load_0.load_0
     """
-    suffix = core.loaded_var_suffix()
+    suffix = LOADED_VAR_SUFFIX
     name = cpt.to_text(name)
-    if suffix not in name:
-        name = name + suffix
-    return name
+    new_name = unique_name.generate_with_ignorable_key('.'.join((name, suffix)))
+    return new_name
 
 
-def _remove_loaded_suffix(name):
-    """
-    Remove loaded suffix to the given variable name
-    e.g. x@LOADED ==> x
-    """
-    suffix = core.loaded_var_suffix()
-    name = cpt.to_text(name)
-    return name.replace(suffix, '')
+@switch_to_static_graph
+def _generate_unique_var_name(prefix):
+    return unique_name.generate_with_ignorable_key(prefix)
 
 
 def _append_loaded_suffix_to_var(program_desc):
+    suffix_varname_dict = dict()
     persistable_vars = _get_persistable_vars(program_desc)
     for var_desc in persistable_vars:
         old_name = var_desc.name()
         new_name = _append_loaded_suffix(var_desc.name())
+        suffix_varname_dict[new_name] = old_name
         var_desc.set_name(new_name)
         for block_idx in six.moves.range(program_desc.num_blocks()):
             block = program_desc.block(block_idx)
@@ -141,6 +142,7 @@ def _append_loaded_suffix_to_var(program_desc):
                 op = block.op(op_idx)
                 op._rename_input(old_name, new_name)
                 op._rename_output(old_name, new_name)
+    return suffix_varname_dict
 
 
 @switch_to_static_graph
@@ -187,6 +189,9 @@ class _ProgramHolder(object):
         # execution scope
         self._inner_scope = core.Scope()
 
+        # append suffix var name dict
+        self._suffix_varname_dict = None
+
         # forward program
         self._infer_program_desc = self._preprocess(program_desc)
         # forward + backward program
@@ -272,7 +277,7 @@ class _ProgramHolder(object):
         self._append_scale_to_output(tmp_program)
 
         # 4. Persistable vars processing
-        # - append @LOADED suffix to persistable vars
+        # - append loaded suffix to persistable vars
         # NOTE: [why need to append suffix to persistable vars]
         # Dygraph and static graph mode use the same naming mechanism. 
         # If users want to load the model fine-tune, it is possible 
@@ -281,10 +286,7 @@ class _ProgramHolder(object):
         # and later after loading, a new linear is added. At this time, 
         # there will be a problem of duplicate names, so here is unified 
         # to add the LOADED suffix to the parameters of the model loaded
-        # during training. And in order to avoid multiple @LOADED suffix
-        # are appended to variable name, we only append @LOADED suffix to
-        # the variable that not contains @LOADED suffix.
-        _append_loaded_suffix_to_var(program_desc)
+        self._suffix_varname_dict = _append_loaded_suffix_to_var(program_desc)
         # - get persistable var
         self._persistable_names = _get_persistable_var_names(program_desc)
 
@@ -298,7 +300,7 @@ class _ProgramHolder(object):
             for i, out in enumerate(self._output_descs):
                 var = program.global_block().var(out.name())
                 var = nn.scale(
-                    var, 1., name="static_model_runner/scale_{}".format(i))
+                    var, 1., name="translated_layer/scale_{}".format(i))
                 scale_output_vars.append(var)
         # 2. update output names & descs
         for i, var in enumerate(scale_output_vars):
@@ -363,7 +365,7 @@ def _load_persistable_vars_by_program(model_path,
     persistable_vars = _get_persistable_vars(program_holder.infer_program)
     load_var_dict = {}
     for each_var in persistable_vars:
-        orig_each_name = _remove_loaded_suffix(each_var.name())
+        orig_each_name = program_holder._suffix_varname_dict[each_var.name()]
         if _is_parameter(each_var, program_holder.infer_program):
             # create output varbase
             new_var = framework.ParamBase(
@@ -421,20 +423,32 @@ def _load_persistable_vars_by_program(model_path,
 
 def _load_persistable_vars(model_path,
                            var_info_path,
+                           program_holder,
                            separate_params=False,
                            params_filename=None):
     # 1. load extra var info
     with open(var_info_path, 'rb') as f:
-        extra_var_info = pickle.load(f) if six.PY2 else pickle.load(
-            f, encoding='latin1')
+        extra_var_info = pickle.load(f)
 
     # 2. construct var dict
     load_var_dict = dict()
     load_var_list = []
-    # NOTE: some var may not be Parameter
-    for name in sorted(extra_var_info):
-        # append suffix, see [why need to append suffix to persistable vars]
-        new_name = _append_loaded_suffix(name)
+    inv_suffix_varname_dict = {
+        value: key
+        for key, value in program_holder._suffix_varname_dict.items()
+    }
+
+    # NOTE(chenweihang): we need load persistable vars based the program,
+    # because the program may be pruned when `save_inference_model`, some
+    # var in `extra_var_info` may have been pruned 
+    for name in sorted(inv_suffix_varname_dict):
+        if name not in extra_var_info:
+            raise RuntimeError(
+                "The model to be loaded is not complete."
+                "The variable `%s` of program cannot be found in loaded model.",
+                name)
+        # get suffix var name, see [why need to append suffix to persistable vars]
+        new_name = inv_suffix_varname_dict[name]
         # create output varbase
         if extra_var_info[name].get('trainable', None) is not None:
             # use default shape and dtype
@@ -507,7 +521,8 @@ def _construct_params_and_buffers(model_path,
     var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
     if os.path.exists(var_info_path):
         var_dict = _load_persistable_vars(model_path, var_info_path,
-                                          separate_params, params_filename)
+                                          programs['forward'], separate_params,
+                                          params_filename)
     else:
         var_dict = _load_persistable_vars_by_program(
             model_path, programs['forward'], params_filename)
@@ -626,15 +641,29 @@ class TranslatedLayer(layers.Layer):
 
         self._program_holder_dict = programs
 
-        for name, var in persistable_vars.items():
-            if isinstance(var, framework.ParamBase):
-                self.add_parameter(name, var)
-            elif isinstance(var, core.VarBase):
-                self.register_buffer(name, var)
-            else:
-                raise TypeError(
-                    "Adding persistent variable which  to layer is not supported now"
-                )
+        # NOTE(chenweihang): [ why not use var name directly? ]
+        # When add parameter or buffer to Layer by follow apis,
+        # the variable name can't contain `.`, beccause which may cause
+        # AttributeError when access the newly added parameter or buffer
+        # in the form of `self.**.**``, but the ParamBase or BarBase
+        # name contains `.` originally, such as `linear_0.w_0`, so here
+        # need to generate new var name for each var
+        self._persistable_var_name_dict = dict()
+        # the TranslatedLayer object holded var names count started from 0
+        with unique_name.guard():
+            for name, var in persistable_vars.items():
+                if isinstance(var, framework.ParamBase):
+                    dy_name = _generate_unique_var_name(PARAMETER_NAME_PREFIX)
+                    self._persistable_var_name_dict[name] = dy_name
+                    self.add_parameter(dy_name, var)
+                elif isinstance(var, core.VarBase):
+                    dy_name = _generate_unique_var_name(BUFFER_NAME_PREFIX)
+                    self._persistable_var_name_dict[name] = dy_name
+                    self.register_buffer(dy_name, var)
+                else:
+                    raise TypeError(
+                        "Adding persistent variable which  to layer is not supported now"
+                    )
 
         self._is_test = True
 
@@ -701,10 +730,11 @@ class TranslatedLayer(layers.Layer):
 
             persistable_vars = []
             for var_name in program_holder.persistable_names:
-                if var_name in self._parameters:
-                    persistable_vars.append(self._parameters[var_name])
-                elif var_name in self._buffers:
-                    persistable_vars.append(self._buffers[var_name])
+                dy_var_name = self._persistable_var_name_dict[var_name]
+                if dy_var_name in self._parameters:
+                    persistable_vars.append(self._parameters[dy_var_name])
+                elif dy_var_name in self._buffers:
+                    persistable_vars.append(self._buffers[dy_var_name])
                 else:
                     raise ValueError(
                         "The persistable variable %s is not exists in current TranslatedLayer."
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 64faae247fbf80637a45429eaa1d5833df122a1a..5a291df4700ad1336e8e850bd3674b1a8d9df979 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -15,20 +15,23 @@
 from __future__ import print_function
 
 import os
-import six
 import pickle
-
 import warnings
+
+import six
 from paddle.fluid import core
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
+from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator
+from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
-from paddle.fluid.framework import Program, Block, Variable, ParamBase, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode
+from paddle.fluid.framework import Block, ParamBase, Program, Variable
+from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dygraph_tracer
+from paddle.fluid.framework import dygraph_only, in_dygraph_mode
 from paddle.fluid.wrapped_decorator import wrap_decorator
-from paddle.fluid.dygraph.io import TranslatedLayer, VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
 
 __all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
 
@@ -129,13 +132,16 @@ def _declarative_(dygraph_func):
     """
     Converts imperative dygraph APIs into declarative function APIs. Decorator
     @declarative handles the Program and Executor of static mode and returns
-    the result as a dygraph VarBase.
+    the result as dygraph Tensor(s). Users could use the returned dygraph
+    Tensor(s) to do imperative training, inference, or other operations. If the
+    decorated function calls other imperative function, the called one will be
+    converted into declarative function as well.
 
     Args:
         dygraph_func (callable): callable imperative function.
 
     Returns:
-        VarBase: containing the numerical result.
+        Tensor(s): containing the numerical result.
 
     Examples:
         .. code-block:: python
@@ -144,6 +150,7 @@ def _declarative_(dygraph_func):
           import numpy as np
           from paddle.fluid.dygraph.jit import declarative
 
+          fluid.enable_dygraph()
 
           @declarative
           def func(x):
@@ -167,7 +174,25 @@ def _declarative_(dygraph_func):
                 "The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
-        return program_translator.get_output(dygraph_func, *args, **kwargs)
+        try:
+            return program_translator.get_output(dygraph_func, *args, **kwargs)
+        except Exception as e:
+            error_data = getattr(e, ERROR_DATA, None)
+            if error_data:
+                new_exception = error_data.create_exception()
+                if six.PY3:
+                    # NOTE(liym27):
+                    # 1. Why `raise new_exception from None`?
+                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
+                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
+                    #   caught exception.
+                    # 2. Use exec to bypass syntax error checking in Python 2.
+
+                    six.exec_("raise new_exception from None")
+                else:
+                    raise new_exception
+            else:
+                raise
 
     return __impl__
 
@@ -680,11 +705,11 @@ def save(layer, model_path, input_spec=None, configs=None):
     prog_translator = ProgramTranslator()
     if not prog_translator.enable:
         raise RuntimeError(
-            "The paddle.imperative.jit.save doesn't work when setting ProgramTranslator.enable=False."
+            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable=False."
         )
     if not isinstance(layer, Layer):
         raise TypeError(
-            "The input layer of paddle.imperative.jit.save should be 'Layer', but received layer type is %s."
+            "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s."
             % type(layer))
 
     if configs is None:
diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py
index f2e914a2137d0be0606556471696fd3d255b3c12..a904f80639752a7538289a1ce7c2abf378ccc634 100644
--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -136,18 +136,13 @@ class LayerObjectHelper(LayerHelperBase):
         return param
 
     # TODO: this should not be called anymore after all activation func move to Layers
-    def append_activation(self,
-                          input_var,
-                          act=None,
-                          use_cudnn=None,
-                          use_mkl_dnn=None):
+    def append_activation(self, input_var, act=None, use_cudnn=None):
         """Append activation
 
             Args:
                 input_var: the input variable. The len(input_var.shape) is
                 larger or equal than 2.
                 act: activation type
-                use_mkl_dnn: if use mkldnn
                 use_cudnn: if use cudnn
 
         Return the Variable of after append activation
@@ -163,8 +158,9 @@ class LayerObjectHelper(LayerHelperBase):
 
         if (use_cudnn is not None) and use_cudnn:
             act['use_cudnn'] = use_cudnn
-        if (use_mkl_dnn is not None) and use_mkl_dnn:
-            act['use_mkldnn'] = use_mkl_dnn
+        use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
+        if (use_mkldnn is not None) and use_mkldnn:
+            act['use_mkldnn'] = use_mkldnn
         act_type = act.pop('type')
 
         tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 72f105933dca919c8b3c2cbdf90318a5444d0866..1ef719b9da187be659d9c898ec996b5ad0c0d8a6 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -146,7 +146,7 @@ class Layer(core.Layer):
               import paddle
               import paddle.nn as nn
               
-              paddle.enable_imperative()
+              paddle.disable_static()
               
               net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
 
@@ -161,7 +161,7 @@ class Layer(core.Layer):
 
               print(net.state_dict())
         """
-        for layer in self.sublayers():
+        for layer in self.children():
             layer.apply(fn)
 
         fn(self)
@@ -283,7 +283,7 @@ class Layer(core.Layer):
     def create_parameter(self,
                          shape,
                          attr=None,
-                         dtype='float32',
+                         dtype=None,
                          is_bias=False,
                          default_initializer=None):
         """Create parameters for this layer.
@@ -353,6 +353,56 @@ class Layer(core.Layer):
         ]
         return ret
 
+    def children(self):
+        """Returns an iterator over immediate children layers.
+
+        Yields:
+            Layer: a child layer
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+
+                with fluid.dygraph.guard():
+                    fc1 = fluid.Linear(10, 3)
+                    fc2 = fluid.Linear(3, 10, bias_attr=False)
+                    model = fluid.dygraph.Sequential(fc1, fc2)
+                    
+                    layer_list = list(model.children())
+
+                    print(layer_list)
+
+        """
+        for _, layer in self.named_children():
+            yield layer
+
+    def named_children(self):
+        """Returns an iterator over immediate children layers, yielding both
+        the name of the layer as well as the layer itself.
+
+        Yields:
+            (string, Layer): Tuple containing a name and child layer
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+
+                with fluid.dygraph.guard():
+                    fc1 = fluid.Linear(10, 3)
+                    fc2 = fluid.Linear(3, 10, bias_attr=False)
+                    model = fluid.dygraph.Sequential(fc1, fc2)
+                    for prefix, layer in model.named_children():
+                        print(prefix, layer)
+
+        """
+        memo = set()
+        for name, layer in self._sub_layers.items():
+            if layer is not None and layer not in memo:
+                memo.add(layer)
+                yield name, layer
+
     def sublayers(self, include_sublayers=True):
         """Returns a list of sub layers.
 
@@ -503,7 +553,10 @@ class Layer(core.Layer):
                 "The name of buffer should be a string, but received {}.".
                 format(type(name).__name__))
         elif '.' in name:
-            raise KeyError("The name of buffer can not contain \".\"")
+            raise KeyError(
+                "The name of buffer can not contain `.`, "
+                "because when you access the newly added buffer in the "
+                "form of `self.**.**`, it will cause AttributeError.")
         elif name == '':
             raise KeyError("The name of buffer can not be empty.")
         elif hasattr(self, name) and name not in self._buffers:
@@ -686,20 +739,38 @@ class Layer(core.Layer):
         Returns:
             Parameter: the parameter passed in.
         """
-        if parameter is None:
-            self._parameters[name] = None
-        elif not isinstance(parameter, framework.Parameter):
+        if '_parameters' not in self.__dict__:
+            raise RuntimeError(
+                "super(YourLayer, self).__init__() should be called firstly.")
+        elif not isinstance(name, six.string_types):
+            raise TypeError(
+                "The name of parameter should be a string, but received {}.".
+                format(type(name).__name__))
+        elif '.' in name:
+            raise KeyError(
+                "The name of parameter can not contain `.`, "
+                "because when you access the newly added parameter in the "
+                "form of `self.**.**`, it will cause AttributeError.")
+        elif name == '':
+            raise KeyError("The name of parameter can not be empty.")
+        elif hasattr(self, name) and name not in self._parameters:
+            raise KeyError("The parameter '{}' already exists.".format(name))
+        elif parameter is not None and not isinstance(parameter,
+                                                      framework.Parameter):
             raise TypeError(
-                "parameter assignment requires Parameter or None, but got '{}'"
-                .format(type(parameter).__name__))
+                "The parameter to be added should be a Parameter, but received {}.".
+                format(type(parameter).__name__))
+        else:
+            if parameter is None:
+                self._parameters[name] = None
 
-        if len(self._loaddict_holder) > 0:
-            assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in stat_dict".format(
-                parameter.name)
+            if len(self._loaddict_holder) > 0:
+                assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format(
+                    parameter.name)
 
-            parameter.set_value(self._loaddict_holder[parameter.name])
+                parameter.set_value(self._loaddict_holder[parameter.name])
 
-        self._parameters[name] = parameter
+            self._parameters[name] = parameter
         return parameter
 
     def __getattr__(self, name):
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index d2c779a85497917179736777dac25efa7cfba228..bb55c6725e6a62f2cef393fd34b249c217be0c54 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -17,7 +17,9 @@ from __future__ import print_function
 from .. import core
 from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
 from ..layers.layer_function_generator import OpProtoHolder
+from ..layers import common_methods
 from . import to_variable, no_grad
+import paddle
 
 import numpy as np
 import six
@@ -30,6 +32,8 @@ _supported_int_dtype_ = [
     core.VarDesc.VarType.INT64,
 ]
 
+_already_patch_varbase = False
+
 
 def monkey_patch_math_varbase():
     """
@@ -37,7 +41,7 @@ def monkey_patch_math_varbase():
     The difference is, in dygraph mode, use auto-generated op functions for better performance.
     """
 
-    @no_grad
+    @no_grad()
     def create_tensor(value, dtype, shape):
         out = _varbase_creator(dtype=dtype)
         out = core.ops.fill_constant(out, 'dtype', dtype, 'shape', shape,
@@ -140,25 +144,50 @@ def monkey_patch_math_varbase():
         else:
             return int(var.numpy().flatten()[0])
 
-    def _scalar_elementwise_add_(var, value):
+    @property
+    def _ndim_(var):
+        return len(var.shape)
+
+    def _scalar_add_(var, value):
         return _scalar_elementwise_op_(var, 1.0, value)
 
-    def _scalar_elementwise_sub_(var, value):
+    def _scalar_sub_(var, value):
         return _scalar_elementwise_op_(var, 1.0, -value)
 
-    def _scalar_elementwise_rsub_(var, value):
+    def _scalar_rsub_(var, value):
         return _scalar_elementwise_op_(var, -1.0, value)
 
-    def _scalar_elementwise_mul_(var, value):
+    def _scalar_mul_(var, value):
         return _scalar_elementwise_op_(var, value, 0.0)
 
-    def _scalar_elementwise_div_(var, value):
+    def _scalar_div_(var, value):
         return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
 
-    def _elemwise_method_creator_(method_name,
-                                  op_type,
-                                  reverse=False,
-                                  scalar_method=None):
+    # TODO(shenliang03):  currently, it supports divide, floor_divide, remainder
+    # for binary operator by using the api to achieve the type promotion
+    def _binary_method_creator_(op_type, reverse=False):
+        import paddle
+
+        def __impl__(self, other_var):
+            import paddle
+            op = getattr(paddle, op_type)
+            if reverse:
+                return op(other_var, self)
+            else:
+                return op(self, other_var)
+
+        __impl__.__doc__ = """
+
+        See paddle.{}""".format(op_type)
+        __impl__.__name__ = op_type
+
+        return __impl__
+
+    # for binary operator such as elementwise, compare
+    def _binary_creator_(method_name,
+                         op_type,
+                         reverse=False,
+                         scalar_method=None):
         def __impl__(self, other_var):
             # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
             # which may lose accuracy. This is a hot fix for release 1.6.
@@ -200,60 +229,117 @@ def monkey_patch_math_varbase():
         __impl__.__doc__ = """
         {0}
         Args:
-            self(Variable): left hand variable
-            other_var(Variable|float|int): right hand variable
+            self(Tensor): left hand Tensor
+            other_var(Tensor|float|int): right hand Tensor
 
         Returns:
-            Variable
+            Tensor
         """.format(comment)
         __impl__.__name__ = method_name
         return __impl__
 
-    # inject methods
-    for method_name, op_type, reverse, scalar_method in (
-        ("__add__", "elementwise_add", False, _scalar_elementwise_add_),
-            # a+b == b+a. Do not need to reverse explicitly
-        ("__radd__", "elementwise_add", False, _scalar_elementwise_add_),
-        ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_),
-        ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_),
-        ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-            # a*b == b*a. Do not need to reverse explicitly
-        ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-        ("__div__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__rdiv__", "elementwise_div", True, None),
-        ("__rtruediv__", "elementwise_div", True, None),
-        ("__pow__", "elementwise_pow", False, None),
-        ("__rpow__", "elementwise_pow", True, None),
-        ("__floordiv__", "elementwise_floordiv", False, None),
-        ("__mod__", "elementwise_mod", False, None),
-            # for logical compare
-        ("__eq__", "equal", False, None),
-        ("__ne__", "not_equal", False, None),
-        ("__lt__", "less_than", False, None),
-        ("__le__", "less_equal", False, None),
-        ("__gt__", "greater_than", False, None),
-        ("__ge__", "greater_equal", False, None)):
-
-        setattr(core.VarBase, method_name,
-                _elemwise_method_creator_(method_name, op_type, reverse,
-                                          scalar_method))
-
-    # b = -a
-    core.VarBase.__neg__ = _neg_
-    core.VarBase.__float__ = _float_
-    core.VarBase.__long__ = _long_
-    core.VarBase.__int__ = _int_
-    core.VarBase.__len__ = _len_
-    core.VarBase.__index__ = _index_
-    core.VarBase.astype = astype
-    """
-    When code is written like this
-    y = np.pi * var
-    ndarray.__mul__(self, var) is called, var will be traced as an array(by using __len__, __getitem__), which is not right.
-    when var.__array_ufunc__  is set to None, var.__rmul__(self,  np) will be called.
+    # Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template')
+    #  Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time.
+    def _method_creator_(op_type, arg_template=None):
+        def __impl__(self):
+            op = getattr(core.ops, op_type)
+            return op(self)
 
-    The details can be seen bellow:
-    https://docs.scipy.org/doc/numpy-1.13.0/neps/ufunc-overrides.html#behavior-in-combination-with-python-s-binary-operations
-    """
-    core.VarBase.__array_ufunc__ = None
+        __impl__.__doc__ = """
+
+        See paddle.{}""".format(op_type)
+        __impl__.__name__ = op_type
+
+        return __impl__
+
+    varbase_methods = [
+        # Type1: From custom fun or lambda
+        ##   b=-a
+        ('__neg__', _neg_),
+        ('__float__', _float_),
+        ('__long__', _long_),
+        ('__int__', _int_),
+        ('__len__', _len_),
+        ('__index__', _index_),
+        ('astype', astype),
+        ('dim', lambda x: len(x.shape)),
+        ('ndimension', lambda x: len(x.shape)),
+        ('ndim', _ndim_),
+        ('size', lambda x: x.shape),
+        # Type2: From Template that create core.ops automatically. It's recommended.
+        ('__add__',
+         _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
+        ##  a+b == b+a. Do not need to reverse explicitly
+        ('__radd__',
+         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
+        ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False,
+                                     _scalar_sub_)),
+        ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True,
+                                      _scalar_rsub_)),
+        ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False,
+                                     _scalar_mul_)),
+        ## a*b == b*a. Do not need to reverse explicitly
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True,
+                                          None)),
+        ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
+                                     None)),
+        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
+                                      None)),
+        # These binary use paddle.optype
+        ('__div__', _binary_method_creator_('divide', False)),
+        ('__truediv__', _binary_method_creator_('divide', False)),
+        ('__rtruediv__', _binary_method_creator_('divide', True)),
+        ('__rdiv__', _binary_method_creator_('divide', True)),
+        ('__floordiv__', _binary_method_creator_('floor_divide', False)),
+        ('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
+        ('__mod__', _binary_method_creator_('remainder', False)),
+        ## for logical compare
+        ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
+        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
+        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
+        ('__array_ufunc__', None),
+        ('sigmoid', _method_creator_('sigmoid', 'name=None')),
+        ('logsigmoid', _method_creator_('logsigmoid', 'name=None')),
+        ('exp', _method_creator_('exp', 'name=None')),
+        ('tanh', _method_creator_('tanh', 'name=None')),
+        ('atan', _method_creator_('atan', 'name=None')),
+        ('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')),
+        ('sqrt', _method_creator_('sqrt', 'name=None')),
+        ('rsqrt', _method_creator_('rsqrt', 'name=None')),
+        ('abs', _method_creator_('abs', 'name=None')),
+        ('ceil', _method_creator_('ceil', 'name=None')),
+        ('floor', _method_creator_('floor', 'name=None')),
+        ('cos', _method_creator_('cos', 'name=None')),
+        ('acos', _method_creator_('acos', 'name=None')),
+        ('asin', _method_creator_('asin', 'name=None')),
+        ('sin', _method_creator_('sin', 'name=None')),
+        ('sinh', _method_creator_('sinh', 'name=None')),
+        ('cosh', _method_creator_('cosh', 'name=None')),
+        ('round', _method_creator_('round', 'name=None')),
+        ('reciprocal', _method_creator_('reciprocal', 'name=None')),
+        ('square', _method_creator_('square', 'name=None')),
+        ('softplus', _method_creator_('softplus', 'name=None')),
+        ('softsign', _method_creator_('softsign', 'name=None')),
+        # Type3: Form module 'paddle.tensor' defaultly.
+        #   It's not a goodway, because it will increase call time.
+    ]
+
+    global _already_patch_varbase
+    if not _already_patch_varbase:
+        for method in varbase_methods:
+            method_name = method[0]
+            method_impl = method[1]
+            setattr(core.VarBase, method_name, method_impl)
+    else:
+        import paddle.tensor
+        for method_name in common_methods:
+            if hasattr(core.VarBase, method_name): continue
+            method_impl = getattr(paddle.tensor, method_name, None)
+            if method_impl: setattr(core.VarBase, method_name, method_impl)
+
+    _already_patch_varbase = True
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index cc2b746b0c1e9a00c21ebe6762ba4da38d20c511..dc3403358b6af25d5da001282fffe53be8bfd3d9 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import paddle
 from six.moves import reduce
 from .. import core
 from ..layers import utils
@@ -35,7 +36,7 @@ __all__ = [
     'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
     'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
     'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
-    'SpectralNorm', 'TreeConv'
+    'SpectralNorm', 'TreeConv', 'Flatten'
 ]
 
 
@@ -180,6 +181,7 @@ class Conv2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
         self._use_cudnn = use_cudnn
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
         self._filter_size = filter_size
         self._num_filters = num_filters
         self._param_attr = param_attr
@@ -187,7 +189,8 @@ class Conv2D(layers.Layer):
         self._dtype = dtype
 
         if (self._num_channels == self._groups and
-                num_filters % self._num_channels == 0 and not self._use_cudnn):
+                num_filters % self._num_channels == 0 and
+                not self._use_cudnn and not self._use_mkldnn):
             self._l_type = 'depthwise_conv2d'
         else:
             self._l_type = 'conv2d'
@@ -224,14 +227,15 @@ class Conv2D(layers.Layer):
         if in_dygraph_mode() and self._l_type == 'conv2d':
             attrs = ('strides', self._stride, 'paddings', self._padding,
                      'dilations', self._dilation, 'groups', self._groups
-                     if self._groups else 1, 'use_cudnn', self._use_cudnn)
+                     if self._groups else 1, 'use_cudnn', self._use_cudnn,
+                     'use_mkldnn', self._use_mkldnn)
             out = core.ops.conv2d(input, self.weight, *attrs)
             pre_bias = out
 
-            pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, self.bias,
-                                                            1)
-            return dygraph_utils._append_activation_in_dygraph(pre_act,
-                                                               self._act)
+            pre_act = dygraph_utils._append_bias_in_dygraph(
+                pre_bias, self.bias, 1, use_mkldnn=self._use_mkldnn)
+            return dygraph_utils._append_activation_in_dygraph(
+                pre_act, self._act, use_mkldnn=self._use_mkldnn)
         inputs = {
             'Input': [input],
             'Filter': [self.weight],
@@ -242,7 +246,7 @@ class Conv2D(layers.Layer):
             'dilations': self._dilation,
             'groups': self._groups if self._groups else 1,
             'use_cudnn': self._use_cudnn,
-            'use_mkldnn': False,
+            'use_mkldnn': self._use_mkldnn,
         }
 
         check_variable_and_dtype(input, 'input',
@@ -267,7 +271,8 @@ class Conv2D(layers.Layer):
                 inputs={'X': [pre_bias],
                         'Y': [self.bias]},
                 outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
+                attrs={'axis': 1,
+                       'use_mkldnn': self._use_mkldnn})
         else:
             pre_act = pre_bias
 
@@ -828,6 +833,8 @@ class Pool2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
 
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
+
         if data_format not in ["NCHW", "NHWC"]:
             raise ValueError(
                 "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
@@ -853,8 +860,8 @@ class Pool2D(layers.Layer):
                      'global_pooling', self._global_pooling, 'strides',
                      self._pool_stride, 'paddings', self._pool_padding,
                      'use_cudnn', self._use_cudnn, 'ceil_mode', self._ceil_mode,
-                     'use_mkldnn', False, 'exclusive', self._exclusive,
-                     'data_format', self._data_format)
+                     'use_mkldnn', self._use_mkldnn, 'exclusive',
+                     self._exclusive, 'data_format', self._data_format)
             return core.ops.pool2d(input, *attrs)
 
         check_variable_and_dtype(
@@ -869,7 +876,7 @@ class Pool2D(layers.Layer):
             "paddings": self._pool_padding,
             "use_cudnn": self._use_cudnn,
             "ceil_mode": self._ceil_mode,
-            "use_mkldnn": False,
+            "use_mkldnn": self._use_mkldnn,
             "exclusive": self._exclusive,
             "data_format": self._data_format,
         }
@@ -958,16 +965,22 @@ class Linear(layers.Layer):
         self.bias = self.create_parameter(
             shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True)
 
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
+
     def forward(self, input):
         if in_dygraph_mode():
             pre_bias = _varbase_creator(dtype=input.dtype)
             core.ops.matmul(input, self.weight, pre_bias, 'transpose_X', False,
-                            'transpose_Y', False, "alpha", 1)
+                            'transpose_Y', False, "alpha", 1, "use_mkldnn",
+                            self._use_mkldnn)
             pre_act = dygraph_utils._append_bias_in_dygraph(
-                pre_bias, self.bias, axis=len(input.shape) - 1)
+                pre_bias,
+                self.bias,
+                axis=len(input.shape) - 1,
+                use_mkldnn=self._use_mkldnn)
 
-            return dygraph_utils._append_activation_in_dygraph(pre_act,
-                                                               self._act)
+            return dygraph_utils._append_activation_in_dygraph(
+                pre_act, self._act, use_mkldnn=self._use_mkldnn)
 
         check_variable_and_dtype(input, 'input',
                                  ['float16', 'float32', 'float64'], "Linear")
@@ -976,6 +989,7 @@ class Linear(layers.Layer):
             "transpose_X": False,
             "transpose_Y": False,
             "alpha": 1,
+            "use_mkldnn": self._use_mkldnn,
         }
         inputs = {"X": [input], "Y": [self.weight]}
 
@@ -990,7 +1004,10 @@ class Linear(layers.Layer):
                 inputs={'X': [tmp],
                         'Y': [self.bias]},
                 outputs={'Out': [pre_activation]},
-                attrs={'axis': len(input.shape) - 1})
+                attrs={
+                    'axis': len(input.shape) - 1,
+                    'use_mkldnn': self._use_mkldnn
+                })
         else:
             pre_activation = tmp
         return self._helper.append_activation(pre_activation, act=self._act)
@@ -1250,6 +1267,7 @@ class BatchNorm(layers.Layer):
         self._param_attr = param_attr
         self._bias_attr = bias_attr
         self._act = act
+        self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"]
 
         assert bias_attr is not False, "bias_attr should not be False in batch_norm."
 
@@ -1314,8 +1332,8 @@ class BatchNorm(layers.Layer):
         if in_dygraph_mode():
             attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                      "is_test", not self.training, "data_layout",
-                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
-                     self._fuse_with_relu, "use_global_stats",
+                     self._data_layout, "use_mkldnn", self._use_mkldnn,
+                     "fuse_with_relu", self._fuse_with_relu, "use_global_stats",
                      self._use_global_stats, 'trainable_statistics',
                      self._trainable_statistics)
             batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
@@ -1323,7 +1341,7 @@ class BatchNorm(layers.Layer):
                 mean_out, variance_out, *attrs)
 
             return dygraph_utils._append_activation_in_dygraph(
-                batch_norm_out, act=self._act)
+                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn)
 
         check_variable_and_dtype(input, 'input',
                                  ['float16', 'float32', 'float64'], 'BatchNorm')
@@ -2308,7 +2326,8 @@ class PRelu(layers.Layer):
             #NOTE(zhiqiu): The _alpha_shape should be [1, channel] + [1] * len(input_shape[2:]), not [1, channel, 1, 1].
             # However, the suffix 1 in the list is useless, since the tensor is viewed as one demension array during kernel calculation. 
             # And, input_shape is not required when mode is 'channel', so it is simplified.
-            self._alpha_shape = [1, channel]
+            #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
+            self._alpha_shape = [1, channel, 1, 1]
         elif mode == 'element':
             assert isinstance(input_shape, (
                 list, tuple
@@ -3182,3 +3201,49 @@ class TreeConv(layers.Layer):
         else:
             pre_activation = out
         return self._helper.append_activation(pre_activation, act=self._act)
+
+
+class Flatten(layers.Layer):
+    """
+    :alias_main: paddle.nn.Flatten
+    :alias: paddle.nn.Flatten,paddle.nn.layer.Flatten,paddle.nn.layer.common.Flatten
+    This interface is used to construct a callable object of the ``FLatten`` class.
+    For more details, refer to code examples.
+    It implements flatten a contiguous range of dims into a tensor.
+
+    Equation:
+
+    Parameters:
+        start_axis(int): first dim to flatten (default = 1)
+        stop_axis(int): last dim to flatten (default = -1).
+    
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          from paddle import to_variable
+          import numpy as np
+
+          inp_np = np.ones([5, 2, 3, 4]).astype('float32')
+          
+          paddle.disable_static()
+          
+          inp_np = to_variable(inp_np)
+          flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
+          flatten_res = flatten(inp_np)
+
+    """
+
+    def __init__(self, start_axis=1, stop_axis=-1):
+        super(Flatten, self).__init__()
+        self.start_axis = start_axis
+        self.stop_axis = stop_axis
+
+    def forward(self, input):
+        out = paddle.tensor.manipulation.flatten(
+            input, start_axis=self.start_axis, stop_axis=self.stop_axis)
+        return out
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 804076f608e714b4c2623bfb580bfe09e42c8db2..54d2cda4ca6858c46140e1fbf6ac8860c3a7c78d 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -242,41 +242,38 @@ class DataParallel(layers.Layer):
     Examples:
         .. code-block:: python
 
-           import numpy as np
-           import paddle.fluid as fluid
-           import paddle.fluid.dygraph as dygraph
-           from paddle.fluid.optimizer import AdamOptimizer
-           from paddle.fluid.dygraph.nn import Linear
-           from paddle.fluid.dygraph.base import to_variable
+            import numpy as np
+            import paddle.fluid as fluid
 
-           place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-           with fluid.dygraph.guard(place=place):
+            place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+            with fluid.dygraph.guard(place):
 
-               # prepare the data parallel context
-               strategy=dygraph.prepare_context()
+                # prepare the data parallel context
+                strategy = fluid.dygraph.prepare_context()
 
-               linear = Linear(1, 10, act="softmax")
-               adam = fluid.optimizer.AdamOptimizer()
+                linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                adam = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, parameter_list=linear.parameters())
 
-               # make the module become the data parallelism module
-               linear = dygraph.DataParallel(linear, strategy)
+                # make the module become the data parallelism module
+                linear = fluid.dygraph.DataParallel(linear, strategy)
 
-               x_data = np.random.random(size=[10, 1]).astype(np.float32)
-               data = to_variable(x_data)
+                x_data = np.random.random(size=[10, 1]).astype(np.float32)
+                data = fluid.dygraph.to_variable(x_data)
 
-               hidden = linear(data)
-               avg_loss = fluid.layers.mean(hidden)
+                hidden = linear(data)
+                avg_loss = fluid.layers.mean(hidden)
 
-               # scale the loss according to the number of trainers.
-               avg_loss = linear.scale_loss(avg_loss)
+                # scale the loss according to the number of trainers.
+                avg_loss = linear.scale_loss(avg_loss)
 
-               avg_loss.backward()
+                avg_loss.backward()
 
-               # collect the gradients of trainers.
-               linear.apply_collective_grads()
+                # collect the gradients of trainers.
+                linear.apply_collective_grads()
 
-               adam.minimize(avg_loss)
-               linear.clear_gradients()
+                adam.minimize(avg_loss)
+                linear.clear_gradients()
     """
 
     def __init__(self, layers, strategy):
@@ -306,20 +303,23 @@ class DataParallel(layers.Layer):
 
                 import numpy as np
                 import paddle.fluid as fluid
-                import paddle.fluid.dygraph as dygraph
-                from paddle.fluid.optimizer import AdamOptimizer
-                from paddle.fluid.dygraph.nn import Linear
-                from paddle.fluid.dygraph.base import to_variable
-
-                place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-                with fluid.dygraph.guard(place=place):
-                    strategy=dygraph.prepare_context()
-                    linear = Linear(1, 10, act="softmax")
-                    adam = fluid.optimizer.AdamOptimizer()
-                    linear = dygraph.DataParallel(linear, strategy)
+
+                place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+                with fluid.dygraph.guard(place):
+
+                    # prepare the data parallel context
+                    strategy = fluid.dygraph.prepare_context()
+
+                    linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                    adam = fluid.optimizer.AdamOptimizer(
+                        learning_rate=0.001, parameter_list=linear.parameters())
+
+                    # make the module become the data parallelism module
+                    linear = fluid.dygraph.DataParallel(linear, strategy)
 
                     x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                    data = to_variable(x_data)
+                    data = fluid.dygraph.to_variable(x_data)
+
                     hidden = linear(data)
                     avg_loss = fluid.layers.mean(hidden)
 
@@ -327,6 +327,8 @@ class DataParallel(layers.Layer):
                     avg_loss = linear.scale_loss(avg_loss)
 
                     avg_loss.backward()
+
+                    # collect the gradients of trainers.
                     linear.apply_collective_grads()
 
                     adam.minimize(avg_loss)
@@ -380,7 +382,7 @@ class DataParallel(layers.Layer):
                 self._reshape_inplace(x=g_var, shape=g_shape)
                 assert g_var.shape == g_shape
 
-    @no_grad
+    @no_grad()
     def apply_collective_grads(self):
         """
         AllReduce the Parameters' gradient.
@@ -390,23 +392,29 @@ class DataParallel(layers.Layer):
 
                 import numpy as np
                 import paddle.fluid as fluid
-                import paddle.fluid.dygraph as dygraph
-                from paddle.fluid.optimizer import AdamOptimizer
-                from paddle.fluid.dygraph.nn import Linear
-                from paddle.fluid.dygraph.base import to_variable
-
-                place = place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
-                with fluid.dygraph.guard(place=place):
-                    strategy=dygraph.prepare_context()
-                    linear = Linear(1, 10, act="softmax")
-                    adam = fluid.optimizer.AdamOptimizer()
-                    linear = dygraph.DataParallel(linear, strategy)
+
+                place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
+                with fluid.dygraph.guard(place):
+
+                    # prepare the data parallel context
+                    strategy = fluid.dygraph.prepare_context()
+
+                    linear = fluid.dygraph.Linear(1, 10, act="softmax")
+                    adam = fluid.optimizer.AdamOptimizer(
+                        learning_rate=0.001, parameter_list=linear.parameters())
+
+                    # make the module become the data parallelism module
+                    linear = fluid.dygraph.DataParallel(linear, strategy)
 
                     x_data = np.random.random(size=[10, 1]).astype(np.float32)
-                    data = to_variable(x_data)
+                    data = fluid.dygraph.to_variable(x_data)
+
                     hidden = linear(data)
                     avg_loss = fluid.layers.mean(hidden)
+
+                    # scale the loss according to the number of trainers.
                     avg_loss = linear.scale_loss(avg_loss)
+
                     avg_loss.backward()
 
                     # collect the gradients of trainers.
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index d509fcc38e771bf5a5bacb63602966a871c7c885..9dbaab2580d21397fa7a4e03b03a5f1c4ac887f2 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -50,14 +50,19 @@ def monkey_patch_varbase():
                     static_var = var_base._to_static_var()
 
         """
+
+        # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. 
+        # It will fail. So, for propery in dygraph only, should not let it getattr(self, attr, None).
+        attr_not_need_keys = ['grad']
         if isinstance(self, ParamBase):
             attr_kwargs = self.__dict__.copy()
         else:
-            attr_names = [
-                name for name in dir(self)
-                if not (inspect.ismethod(getattr(self, name)) or
-                        name.startswith('_'))
-            ]
+            attr_names = []
+            for name in dir(self):
+                if name not in attr_not_need_keys and not (
+                        inspect.ismethod(getattr(self, name)) or
+                        name.startswith('_')):
+                    attr_names.append(name)
             attr_kwargs = {name: getattr(self, name) for name in attr_names}
 
         attr_keys = ['block', 'shape', 'dtype', 'type', 'name', 'persistable']
@@ -216,6 +221,14 @@ def monkey_patch_varbase():
         else:
             return np.array(new_ivar.value().get_tensor())
 
+    @property
+    def grad(self):
+        """
+        The alias of gradient().
+        """
+
+        return self.gradient()
+
     def __str__(self):
         """
         Convert a VarBase object to a readable string.
@@ -226,7 +239,7 @@ def monkey_patch_varbase():
             .. code-block:: python
 
                 import paddle
-                paddle.enable_imperative()
+                paddle.disable_static()
                 x = paddle.rand([1, 5])
                 print(x)
                 # Variable: eager_tmp_0
@@ -235,13 +248,13 @@ def monkey_patch_varbase():
                 #   - layout: NCHW
                 #   - dtype: float
                 #   - data: [0.645307 0.597973 0.732793 0.646921 0.540328]
-                paddle.disable_imperative()
+                paddle.enable_static()
         """
         tensor = self.value().get_tensor()
         if tensor._is_initialized():
-            return 'Variable: %s\n%s' % (self.name, str(tensor))
+            return 'Tensor: %s\n%s' % (self.name, str(tensor))
         else:
-            return 'Variable: %s, not initialized' % (self.name)
+            return 'Tensor: %s, not initialized' % (self.name)
 
     @property
     def block(self):
@@ -260,8 +273,9 @@ def monkey_patch_varbase():
     for method_name, method in (
         ("__bool__", __bool__), ("__nonzero__", __nonzero__),
         ("_to_static_var", _to_static_var), ("set_value", set_value),
-        ("block", block), ("backward", backward), ("gradient", gradient),
-        ("__str__", __str__)):
+        ("block", block), ("backward", backward), ("grad", grad),
+        ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__),
+        ("__module__", "paddle"), ("__name__", "Tensor")):
         setattr(core.VarBase, method_name, method)
 
     # patch math methods for varbase
diff --git a/python/paddle/fluid/dygraph_utils.py b/python/paddle/fluid/dygraph_utils.py
index 7b559494e6c3b779983e54f5f9675170ef985f63..a2338b874f51a209cf941d8c08d5995db4054968 100644
--- a/python/paddle/fluid/dygraph_utils.py
+++ b/python/paddle/fluid/dygraph_utils.py
@@ -45,17 +45,19 @@ def _append_activation_in_dygraph(input,
 
 
 @dygraph_only
-def _append_bias_in_dygraph(input, bias=None, axis=1):
+def _append_bias_in_dygraph(input, bias=None, axis=1, use_mkldnn=False):
     """Append bias operation in dygraph mode.
 
         Args:
             input: the input variable. 
             bias:  the bias to be appended
             axis:  the axis to perform operation
+            use_mkldnn: whether to use mkldnn
 
     Return the Variable after bias operation
     """
     if bias is None:
         return input
 
-    return core.ops.elementwise_add(input, bias, 'axis', axis)
+    return core.ops.elementwise_add(input, bias, 'axis', axis, 'use_mkldnn',
+                                    use_mkldnn)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 9b22a016baa9cdc54cabb1d305518649c02b6546..2e3f34f41648a9343b4bccd1044bcd3f7b3d8189 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -25,11 +25,14 @@ import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator, convert_np_dtype_to_dtype_
 from . import core
+from . import unique_name
 from . import compiler
 from .. import compat as cpt
 from .trainer_factory import TrainerFactory
 from .trainer_factory import FetchHandlerMonitor
 import copy
+from . import framework
+from .incubate.checkpoint import auto_checkpoint as acp
 
 __all__ = ['Executor', 'global_scope', 'scope_guard']
 
@@ -107,7 +110,7 @@ def scope_guard(scope):
         _switch_scope(ex)
 
 
-def as_numpy(tensor):
+def as_numpy(tensor, copy=False):
     """
     Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information.
     For higher dimensional sequence data, please use LoDTensor directly.
@@ -126,6 +129,7 @@ def as_numpy(tensor):
 
     Args:
        tensor(Variable): a instance of Tensor
+       copy(bool, optional): Whether to use deep copy.
 
     Returns:
         numpy.ndarray
@@ -142,7 +146,10 @@ def as_numpy(tensor):
             Please set the parameter 'return_numpy' as 'False' to \
             return LoDTensor itself directly.")
     if tensor._is_initialized():
-        return np.array(tensor)
+        if copy:
+            return np.array(tensor)
+        else:
+            return np.asarray(tensor)
     else:
         return None
 
@@ -347,7 +354,7 @@ def _fetch_var(name, scope=None, return_numpy=True):
         " program.")
     tensor = var.get_tensor()
     if return_numpy:
-        tensor = as_numpy(tensor)
+        tensor = as_numpy(tensor, copy=True)
     return tensor
 
 
@@ -542,10 +549,8 @@ class Executor(object):
 
     def __init__(self, place=None):
         if place is None:
-            if core.is_compiled_with_cuda():
-                self.place = core.CUDAPlace(0)
-            else:
-                self.place = core.CPUPlace()
+            expected_place = framework._current_expected_place()
+            self.place = expected_place
         else:
             self.place = place
         self.program_caches = dict()
@@ -559,6 +564,9 @@ class Executor(object):
         self._closed = False
         self.pruned_program_scope_caches = dict()
 
+        self._auto_checkpoint_name = unique_name.generate(
+            "__auto_checkpoint_executor__")
+
     def _get_scope_cache(self, program_cache_key):
         return self.scope_caches.get(program_cache_key, None)
 
@@ -846,6 +854,7 @@ class Executor(object):
 
     def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                       return_numpy, return_merged):
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         exe = program._executor
         # TODO(zhenghuihuang): quantization uses Graph in CompiledProgram
         # instead of program. We will add support for checking Vars in Graph
@@ -889,6 +898,16 @@ class Executor(object):
                 res.append(res_dict)
             exe.feed_tensors_into_local_scopes(res)
 
+        if hasattr(program._program, 'lr_sheduler'):
+            lr_sheduler = program._program.lr_sheduler
+            assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler"
+            lr_value = lr_sheduler()
+            lr_var = program._program.global_block().vars[lr_sheduler._var_name]
+            lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
+            exe.feed_and_split_tensor_into_local_scopes({
+                lr_sheduler._var_name: lr_tensor
+            })
+
         fetch_var_names = list(map(_to_name_str, fetch_list))
         tensors = exe.run(fetch_var_names, return_merged)._move_to_list()
         return as_numpy(tensors) if return_numpy else tensors
@@ -1152,6 +1171,28 @@ class Executor(object):
 
         compiled = isinstance(program, compiler.CompiledProgram)
 
+        # Check if fluid.data() variable no feed data
+        if use_prune:
+            if compiled:
+                global_block = program._program.global_block()
+            else:
+                global_block = program.global_block()
+            for varname in global_block.vars:
+                vardesc = global_block.desc.find_var(cpt.to_bytes(varname))
+                varobj = global_block.vars[varname]
+
+                # Can not check var build by fluid.layers.data(), bucause fluid.layers.data() had not set need_check_feed
+                if vardesc.persistable() == False and \
+                    vardesc.type() == core.VarDesc.VarType.LOD_TENSOR and \
+                    vardesc.need_check_feed() == True and \
+                    varobj._stop_gradient == True and \
+                    varobj.is_data == True and \
+                    varobj.belong_to_optimizer == False and \
+                    varname not in feed:
+                    raise ValueError('Need feed data for variable %s' % varname)
+
+        acp._auto_checkpoint(self, program)
+
         # For backward compatibility, run directly.
         if not compiled:
             # In distributed training, the compiled program is saved in Program._graph
@@ -1196,7 +1237,7 @@ class Executor(object):
 
     def _run_program(self, program, feed, fetch_list, feed_var_name,
                      fetch_var_name, scope, return_numpy, use_program_cache):
-
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         if feed is None:
             feed = {}
         elif isinstance(feed, (list, tuple)):
@@ -1252,6 +1293,16 @@ class Executor(object):
                 fetch_var_name=fetch_var_name)
 
         self._feed_data(program, feed, feed_var_name, scope)
+        if hasattr(program, 'lr_sheduler'):
+            assert isinstance(program.lr_sheduler,
+                              _LRScheduler), "must be _LRScheduler"
+            lr_sheduler = program.lr_sheduler
+            lr_value = lr_sheduler()
+            lr_var = program.global_block().vars[lr_sheduler._var_name]
+            data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
+            tensor = core.get_variable_tensor(scope, lr_sheduler._var_name)
+            tensor.set(data, self.place)
+
         if not use_program_cache:
             self._default_executor.run(program.desc, scope, 0, True, True,
                                        fetch_var_name)
@@ -1300,6 +1351,12 @@ class Executor(object):
                          fetch_list=None,
                          fetch_info=None,
                          print_period=100):
+        is_heter = 0
+        if not program._fleet_opt is None:
+            if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker":
+                is_heter = 1
+            if program._fleet_opt("trainer", "") == "HeterXpuTrainer":
+                is_heter = 1
         if scope is None:
             scope = global_scope()
         if fetch_list is None:
@@ -1308,6 +1365,11 @@ class Executor(object):
             fetch_info = []
         assert len(fetch_list) == len(fetch_info)
         compiled = isinstance(program, compiler.CompiledProgram)
+        if is_heter:
+            from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+            from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
+            fu = FleetUtil()
+            ret = fu.split_program_by_device(program)
         if not compiled:
             # TODO: Need a better way to distinguish and specify different execution mode
             if program._pipeline_opt:
@@ -1317,6 +1379,8 @@ class Executor(object):
                 trainer = TrainerFactory()._create_trainer(program._fleet_opt)
                 trainer._set_thread_barrier(program._is_distributed)
             trainer._set_program(program)
+            if is_heter:
+                trainer._set_heter_info(ret)
         else:
             if program._pipeline_opt:
                 trainer = TrainerFactory()._create_trainer(
@@ -1476,6 +1540,60 @@ class Executor(object):
                                       debug, fetch_list, fetch_info,
                                       print_period, fetch_handler)
 
+    def start_heter_trainer(self,
+                            program=None,
+                            scope=None,
+                            debug=False,
+                            fetch_list=None,
+                            fetch_info=None,
+                            print_period=100,
+                            fetch_handler=None):
+        return self._start_heter_trainer(program, scope, False, debug,
+                                         fetch_list, fetch_info, print_period,
+                                         fetch_handler)
+
+    def _start_heter_trainer(self,
+                             program=None,
+                             scope=None,
+                             is_infer=False,
+                             debug=False,
+                             fetch_list=None,
+                             fetch_info=None,
+                             print_period=100,
+                             fetch_handler=None):
+
+        scope, trainer = self._prepare_trainer(
+            program=program,
+            dataset=None,
+            scope=scope,
+            thread=1,
+            debug=debug,
+            fetch_list=fetch_list,
+            fetch_info=fetch_info,
+            print_period=print_period)
+
+        trainer._set_infer(is_infer)
+        trainer._gen_trainer_desc()
+
+        self._dump_debug_info(program=program, trainer=trainer)
+
+        trainer_instance = self._default_executor.init_for_dataset(
+            program.desc, trainer._desc(), scope, None)
+
+        #if fetch_handler is not None:
+        #    scope0 = trainer_instance.get_worker_scope(0)
+        #    fetch_monitor = FetchHandlerMonitor(scope0, fetch_handler)
+        #    fetch_monitor.start()
+        #    self._default_executor.run_from_dataset(trainer_instance)
+        #    fetch_monitor.stop()
+        #    self._default_executor.release_trainer(trainer_instance)
+        #else:
+
+        self._default_executor.run_from_dataset(trainer_instance)
+        #self._default_executor.release_trainer(trainer_instance)
+
+        return trainer_instance
+
     def train_from_dataset(self,
                            program=None,
                            dataset=None,
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index a7faf4041cfe496142427c6c6f110d849a54cca4..ef50294b8e762ae84f9b37f2571458e6588c4bc6 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -48,6 +48,7 @@ __all__ = [
     'cuda_pinned_places',
     'in_dygraph_mode',
     'is_compiled_with_cuda',
+    'is_compiled_with_xpu',
     'Variable',
     'ComplexVariable',
     'load_op_library',
@@ -64,7 +65,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
 _dygraph_tracer_ = None
-_dygraph_current_expected_place_ = None
+_global_expected_place_ = None
 _current_device = None
 global_prog_seed = 0
 
@@ -247,7 +248,26 @@ def _dygraph_tracer():
 
 
 def _current_expected_place():
-    return _dygraph_current_expected_place_
+    global _global_expected_place_
+    if _global_expected_place_ is None:
+        if core.is_compiled_with_cuda():
+            _global_expected_place_ = core.CUDAPlace(0)
+        else:
+            _global_expected_place_ = core.CPUPlace()
+
+    return _global_expected_place_
+
+
+def _set_dygraph_tracer_expected_place(place):
+    global _dygraph_tracer_
+    if _dygraph_tracer_ is not None:
+        _dygraph_tracer_._expected_place = place
+
+
+def _set_expected_place(place):
+    global _global_expected_place_
+    _global_expected_place_ = place
+    _set_dygraph_tracer_expected_place(place)
 
 
 # TODO(zhiqiu): remove this function.
@@ -291,6 +311,21 @@ def _cuda_ids():
     return device_ids
 
 
+def is_compiled_with_xpu():
+    """
+    Whether this whl package can be used to run the model on XPU.
+
+    Returns (bool): support xpu or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            support_xpu = fluid.is_compiled_with_xpu()
+    """
+    return core.is_compiled_with_xpu()
+
+
 def is_compiled_with_cuda():
     """
     Whether this whl package can be used to run the model on GPU.
@@ -1689,34 +1724,40 @@ def get_all_op_protos():
 
 class ComplexVariable(object):
     """
-    The Variable defined on the complex number domain. It contains two common 
-    real number Variables as its members, :attr:`real` and :attr:`imag` 
+    The ComplexTensor defined on the complex number domain. It contains two common 
+    real number Tensor as its members, :attr:`real` and :attr:`imag` 
     holding the real part and imaginary part of complex numbers respectively.
     
     **Notes**:
-        **The constructor of ComplexVariable should not be invoked directly.**
+        **The constructor of ComplexTensor should not be invoked directly.**
 
-        **Only support dygraph mode at present. Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph ComplexVariable with complex number data.**
+        **Only support dygraph mode at present. Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph ComplexTensor with complex number data.**
 
     Args:
-        real (Variable): The Variable holding real-part data.
-        imag (Variable): The Variable holding imaginery-part data.
+        real (Tensor): The Tensor holding real-part data.
+        imag (Tensor): The Tensor holding imaginery-part data.
     
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            a = np.array([1.0+2.0j, 0.2])
-            with fluid.dygraph.guard():
-                var = fluid.dygraph.to_variable(a, name="new_var")
-                print(var.name, var.dtype, var.shape)
-                # ({'real': u'new_var.real', 'imag': u'new_var.imag'}, 'complex128', [2L]) 
-                print(var.numpy())
-                # [1. +2.j 0.2+0.j]
+            paddle.enable_imperative()
+            x = paddle.to_tensor([1.0+2.0j, 0.2])
+            print(x.name, x.dtype, x.shape)
+            # ({'real': 'generated_tensor_0.real', 'imag': 'generated_tensor_0.imag'}, 'complex128', [2L])
+            print(x.numpy())
+            # [1. +2.j 0.2+0.j]
+            print(type(x))
+            # <class 'paddle.ComplexTensor'>
     """
 
+    def __new__(cls, *arg, **kwargs):
+        cls.__module__ = "paddle"
+        cls.__name__ = "ComplexTensor"
+        return super(ComplexVariable, cls).__new__(cls)
+
     def __init__(self, real, imag):
         assert real.shape == imag.shape, "The real part and imaginary part " \
             "of a ComplexVariable should have the same shape!"
@@ -1763,7 +1804,9 @@ class ComplexVariable(object):
         return self.real.numpy() + 1j * self.imag.numpy()
 
     def __str__(self):
-        return "REAL: " + self.real.__str__() + "IMAG: " + self.imag.__str__()
+        return "ComplexTensor[real]: %s\n%s\nComplexTensor[imag]: %s\n%s" % (
+            self.real.name, str(self.real.value().get_tensor()), self.imag.name,
+            str(self.imag.value().get_tensor()))
 
     __repr__ = __str__
 
@@ -2385,12 +2428,29 @@ class Operator(object):
     def _is_optimize_op(self):
         op_maker = core.op_proto_and_checker_maker
         OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
+
+        if not self.desc.has_attr(op_maker.kOpRoleAttrName()):
+            return False
+
         op_role = self.desc.attr(op_maker.kOpRoleAttrName())
         if op_role & int(OPTIMIZE):
             return True
-        else:
+
+        return False
+
+    def _is_backward_op(self):
+        op_maker = core.op_proto_and_checker_maker
+        BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
+
+        if not self.desc.has_attr(op_maker.kOpRoleAttrName()):
             return False
 
+        op_role = self.desc.attr(op_maker.kOpRoleAttrName())
+        if op_role & int(BACKWARD):
+            return True
+
+        return False
+
 
 class Block(object):
     """
@@ -3942,6 +4002,10 @@ class Program(object):
         # appending gradients times
         self._appending_grad_times = 0
 
+        # identifier for auto checkpoint
+        self._auto_checkpoint_name = unique_name.generate(
+            "__auto_checkpoint_program__")
+
         # compiled program, i.e. Graph
         self._graph = None
 
@@ -4386,6 +4450,8 @@ class Program(object):
             p._current_role = self._current_role
             p.__op_role_var = self.__op_role_var
             p._appending_grad_times = self._appending_grad_times
+            if hasattr(self, 'lr_sheduler'):
+                p.lr_sheduler = self.lr_sheduler
 
             #NOTE(zhiqiu): we sync the cloned program, to update its program by
             # its desc.
@@ -5071,12 +5137,13 @@ class Parameter(Variable):
 
 class ParamBase(core.VarBase):
     """
-    ParamBase is derived from VarBase( Which is the Variable in Dygraph Mode ). A ParamBase is a persistable
-    VarBase, and will be updated by optimizers after each iteration.
+    ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). 
+    A ParamBase is a persistable Tensor, and will be updated by optimizers 
+    after each iteration.
     The training of a neural network is essentially the updating of
     its ParamBase.
 
-    Relative to a general Variable, a ParamBase has several its own
+    Relative to a general Tensor, a ParamBase has several its own
     member variables:
 
     Args:
@@ -5154,7 +5221,7 @@ class ParamBase(core.VarBase):
             .. code-block:: python
 
                 import paddle
-                paddle.enable_imperative()
+                paddle.disable_static()
                 conv = paddle.nn.Conv2D(3, 3, 5)
                 print(conv.weight)
                 # Parameter: conv2d_0.w_0
@@ -5163,13 +5230,10 @@ class ParamBase(core.VarBase):
                 #   - layout: NCHW
                 #   - dtype: float
                 #   - data: [...] 
-                paddle.disable_imperative()
+                paddle.enable_static()
         """
-        tensor = self.value().get_tensor()
-        if tensor._is_initialized():
-            return 'Parameter: %s\n%s' % (self.name, str(tensor))
-        else:
-            return 'Parameter: %s, not initialized' % (self.name)
+        return "Parameter containing:\n  {}\n  - stop_gradient: {}".format(
+            super(ParamBase, self).__str__(), self.stop_gradient)
 
     __repr__ = __str__
 
@@ -5390,14 +5454,14 @@ def _dygraph_guard(tracer):
 
 @signature_safe_contextmanager
 def _dygraph_place_guard(place):
-    global _dygraph_current_expected_place_
-    tmp_place = _dygraph_current_expected_place_
-    _dygraph_current_expected_place_ = place
+    global _global_expected_place_
+    tmp_place = _global_expected_place_
+    _global_expected_place_ = place
 
     try:
         yield
     finally:
-        _dygraph_current_expected_place_ = tmp_place
+        _global_expected_place_ = tmp_place
 
 
 def load_op_library(lib_filename):
diff --git a/python/paddle/fluid/generator.py b/python/paddle/fluid/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e11b2e484dce1dd4260b3052d0f0a58f3cfc420a
--- /dev/null
+++ b/python/paddle/fluid/generator.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is definition of generator class, which is for managing the state of the algorithm that produces pseudo random numbers."""
+
+from . import core
+
+__all__ = ['Generator']
+
+default_rng_seed_val = 34342423252
+
+
+class Generator(object):
+    """Generator class"""
+
+    def __init__(self, device="CPU"):
+        """init"""
+        self.device = device
+        seed_in = default_rng_seed_val
+        if self.device == "CPU":
+            self.generator = core.Generator()
+            # self.generator.manual_seed(seed_in)
+        else:
+            raise ValueError(
+                "generator class with device %s does not exist, currently only support generator with device 'CPU' "
+                % device)
+
+    def get_state(self):
+        return self.generator.get_state()
+
+    def set_state(self, state):
+        self.generator.set_state(state)
+
+    def manual_seed(self, seed):
+        self.generator.manual_seed(seed)
+
+    def seed(self):
+        return self.generator.seed()
+
+    def initial_seed(self):
+        return self.generator.initial_seed()
+
+    def random(self):
+        return self.generator.random()
+
+    def get_cpu_engine(self):
+        return self.generator.get_cpu_engine()
+
+    def set_cpu_engine(self, cpu_engine):
+        self.generator.set_cpu_engine(cpu_engine)
diff --git a/python/paddle/fleet/runtime/__init__.py b/python/paddle/fluid/incubate/checkpoint/__init__.py
similarity index 87%
rename from python/paddle/fleet/runtime/__init__.py
rename to python/paddle/fluid/incubate/checkpoint/__init__.py
index f38287cf51a728011d16f735e58ec54a7cdfe0c8..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/python/paddle/fleet/runtime/__init__.py
+++ b/python/paddle/fluid/incubate/checkpoint/__init__.py
@@ -11,7 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .collective_runtime import CollectiveRuntime
-
-__all__ = ["CollectiveRuntime"]
diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad51a043a0a50f89f77811adb7f95759a4f220be
--- /dev/null
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
@@ -0,0 +1,687 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import logging
+import hashlib
+import json
+import os
+import six
+import time
+import collections
+from threading import Thread, current_thread
+from contextlib import contextmanager
+
+from paddle.fluid import unique_name, compiler
+from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel
+from paddle.fluid.framework import in_dygraph_mode, Program
+
+g_train_epoch_range = None
+g_checker = None
+
+logger = None
+
+generator = unique_name.UniqueNameGenerator()
+
+CONST_CHECKPOINT = "checkpoint"
+CONST_MEMORYINIT = "memory_init"
+
+# auto checkpoint by dataloader event.
+CONST_DACP_TYPE = "dacp"
+# auto checkpoint by loop range.
+CONST_ACP_TYPE = "acp"
+g_acp_type = None
+g_program_attr = {}  # program_name->can_be_auto_checkpoint
+
+
+def _get_logger(log_level, name="auto_checkpoint"):
+    global logger
+    if logger != None:
+        return logger
+
+    logger = logging.getLogger(name)
+    logger.setLevel(log_level)
+    logger.propagate = False
+
+    log_handler = logging.StreamHandler()
+    log_format = logging.Formatter(
+        '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
+    log_handler.setFormatter(log_format)
+    logger.addHandler(log_handler)
+
+    return logger
+
+
+def _thread_checker():
+    assert current_thread().name == "MainThread", \
+        "auto checkpoint must run under main thread"
+
+
+class AutoCheckpointChecker(object):
+    def __init__(self):
+        self._run_env = None
+        self._platform = None
+        self._job_id = None
+        self._hdfs_home = None
+        self._hdfs_name = None
+        self._hdfs_ugi = None
+        self._hdfs_checkpoint_path = None
+        self._trainer_id = None
+        self._ce_test = None
+
+        self._run_env = os.getenv("PADDLE_RUNNING_ENV")
+        if self._run_env != "PADDLE_EDL_AUTO_CHECKPOINT":
+            return
+
+        try:
+            self._platform = os.environ["PADDLE_RUNNING_PLATFORM"]
+            self._job_id = os.environ["PADDLE_JOB_ID"]
+            self._hdfs_home = os.environ["PADDLE_EDL_HDFS_HOME"]
+            self._hdfs_name = os.environ["PADDLE_EDL_HDFS_NAME"]
+            self._hdfs_ugi = os.environ["PADDLE_EDL_HDFS_UGI"]
+            self._hdfs_checkpoint_path = os.environ[
+                "PADDLE_EDL_HDFS_CHECKPOINT_PATH"]
+            self._trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
+
+            self._ce_test = int(os.getenv("PADDLE_EDL_ONLY_FOR_CE_TEST", "0"))
+            self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache")
+
+            self._save_checkpoint_inter = int(
+                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  #s
+
+            if not self._ce_test:
+                assert len(self._hdfs_home) > 3 and \
+                    len(self._hdfs_name) > 6 and \
+                    len(self._hdfs_ugi) > 3 and \
+                    len(self._hdfs_checkpoint_path) > 0, "hdfs environ must set"
+            else:
+                assert len(self._hdfs_home) > 3 and \
+                    len(self._hdfs_checkpoint_path) > 0, "hdfs environ must set"
+
+        except Exception as e:
+            logger.fatal("exception:{}".format(e))
+            sys.exit(1)
+
+    def get_range_checkpoint_path(self, name):
+        return "{}/{}/range/{}".format(self.hdfs_checkpoint_path, self.job_id,
+                                       name)
+
+    def get_exe_checkpoint_path(self, name):
+        return "{}/{}/exe/{}".format(self.hdfs_checkpoint_path, self.job_id,
+                                     name)
+
+    def get_job_path(self):
+        return "{}/{}".format(self.hdfs_checkpoint_path, self.job_id)
+
+    @property
+    def save_checkpoint_inter(self):
+        return self._save_checkpoint_inter
+
+    def valid(self):
+        if in_dygraph_mode():
+            return False
+
+        return  self._run_env is not None and \
+            self._platform is not None and \
+            self._job_id is not None and \
+            self._hdfs_home is not None and \
+            self._hdfs_name is not None and \
+            self._hdfs_ugi is not None and \
+            self._hdfs_checkpoint_path is not None and \
+            self._trainer_id is not None
+
+    def __str__(self):
+        return "run_env:{} platform:{} job_id:{} \
+            hdfs_home:{} hdfs_name:{} hdfs_ugi:{} \
+            hdfs_checkpoint_path:{} trainer_id:{} ce_test".format(
+            self._run_env, self._platform, self._hdfs_home, self._hdfs_name,
+            self._hdfs_ugi, self._hdfs_checkpoint_path, self._trainer_id,
+            self._ce_test)
+
+    @property
+    def trainer_id(self):
+        return self._trainer_id
+
+    @property
+    def run_env(self):
+        return self._run_env
+
+    @property
+    def platform(self):
+        return self._platform
+
+    @property
+    def job_id(self):
+        return self._job_id
+
+    @property
+    def hdfs_home(self):
+        return self._hdfs_home
+
+    @property
+    def hdfs_name(self):
+        return self._hdfs_name
+
+    @property
+    def ce_test(self):
+        return self._ce_test
+
+    @property
+    def hdfs_ugi(self):
+        return self._hdfs_ugi
+
+    @property
+    def hdfs_checkpoint_path(self):
+        return self._hdfs_checkpoint_path
+
+    @staticmethod
+    def generate_range_name():
+        return generator("_range_")
+
+
+class ExeTrainStatus(SerializableBase):
+    def __init__(self):
+        self._epoch_no = -1  # start epoch_no
+        self._hash_key = None
+        self._key = None
+        self._checkpoint_path = None
+        self._checkpoint_no = None
+        self._restored_from = None
+        self._exe = None
+        self._program = None
+        self._exe_name = None
+        self._program_name = None
+
+        self._file_name = "exe_train_status"
+
+    def __eq__(self, t):
+        return self._epoch_no == t._epoch_no and \
+            self._hash_key == t._hash_key and \
+            self._key == t._key and \
+            self._checkpoint_path == t._checkpoint_path and \
+            self._checkpoint_no == t._checkpoint_no and \
+            self._exe_name == t._exe_name and \
+            self._program_name == t._program_name
+
+    def __ne__(self, t):
+        return not self == t
+
+    def serialize(self, path):
+        file_name = "{}/{}".format(path, self._file_name)
+        with open(file_name, 'w') as f:
+            s = self._serialize()
+            f.write(s)
+
+    def _serialize(self, pop_keys=["restored_from"]):
+        d = self._to_dict()
+        for k in pop_keys:
+            d.pop(k, None)
+        return json.dumps(d)
+
+    def deserialize(self, path):
+        d = None
+        file_name = "{}/{}".format(path, self._file_name)
+        with open(file_name, 'r') as f:
+            s = f.read()
+            self._deserialize(s)
+
+    def _deserialize(self, s):
+        d = json.loads(s)
+        self._epoch_no = d["epoch_no"]
+        self._key = d["key"]
+        self._hash_key = d["hash_key"]
+        self._checkpoint_path = d["checkpoint_path"]
+        self._checkpoint_no = d["checkpoint_no"]
+        self._exe_name = d["exe_name"]
+        self._program_name = d["program_name"]
+
+    def _to_dict(self):
+        return {
+            "epoch_no": self._epoch_no,
+            "key": self._key,
+            "hash_key": self._hash_key,
+            "checkpoint_path": self._checkpoint_path,
+            "restored_from": self._restored_from,
+            "exe_name": self._exe_name,
+            "program_name": self._program_name,
+            "checkpoint_no": self._checkpoint_no
+        }
+
+    def __str__(self):
+        return self._serialize([])
+
+
+class TrainEpochRange(SerializableBase):
+    def __init__(self,
+                 max_epoch_num,
+                 name,
+                 checkpoint_inter=None,
+                 restored=True):
+        self._max_epoch_num = max_epoch_num
+        self._epoch_no = -1  # current epoch_no
+        self._name = name
+        self._restored_from = None
+        self._exe_status = {}
+        self._flag_generated = False
+
+        self._checker = g_checker
+        if checkpoint_inter is not None:
+            self._save_checkpoint_inter = checkpoint_inter
+        else:
+            self._save_checkpoint_inter = self._checker.save_checkpoint_inter
+        assert self._save_checkpoint_inter >= 0, "checkpointer:{} must >=0".format(
+            self._save_checkpoint_inter)
+        self._last_checkpoint_time = time.time()
+
+        self._load_cp_nos = None
+        self._checkpoint_epoch_no = None
+
+        if not self._checker.valid():
+            return
+
+        self._file_name = "range_train_status"
+
+        if not restored:
+            return
+
+        self._checkpoint_path = self._checker.get_range_checkpoint_path(name)
+
+        config = {
+            "fs.default.name": self._checker.hdfs_name,
+            "hadoop.job.ugi": self._checker.hdfs_ugi
+        }
+
+        if self._checker.ce_test:
+            config = None
+
+        from paddle.distributed.fleet.utils.fs import HDFSClient
+        self._hdfs = HDFSClient(self._checker.hdfs_home, config)
+
+        self._cper = CheckpointSaver(self._hdfs)
+
+        _thread_checker()
+
+        self._get_last_valid_checkpoint()
+
+    def _look_for_valid(self, cp_nos):
+        cps = []
+        epoch_no = -1
+        for i in cp_nos[::-1]:
+            t = TrainEpochRange(self._max_epoch_num, self.name, restored=False)
+            self._cper.load_checkpoint(
+                self._checkpoint_path, [t],
+                self._checker.trainer_id,
+                checkpoint_no=i,
+                local_cache_path=self._checker._fs_cache)
+            cps.append(t)
+            logger.debug("look for valid:{} t:{}".format(i, t._serialize()))
+            if epoch_no < 0:
+                epoch_no = t._epoch_no
+            else:
+                if epoch_no - t._epoch_no >= 1:
+                    return t, i
+        return None, None
+
+    def _get_last_valid_checkpoint(self):
+        self._load_cp_nos = self._cper.get_checkpoint_no(self._checkpoint_path)
+        logger.info("find checkpoint nos:{}".format(self._load_cp_nos))
+
+        if len(self._load_cp_nos) < 1:
+            self._restored_from = CONST_MEMORYINIT
+            return
+
+        if g_acp_type == CONST_ACP_TYPE:
+            # get the last one
+            self._cper.load_checkpoint(
+                self._checkpoint_path, [self],
+                self._checker.trainer_id,
+                local_cache_path=self._checker._fs_cache)
+            self._restored_from = CONST_CHECKPOINT
+            self._checkpoint_epoch_no = self._epoch_no
+
+            logger.info("load tain_epoch_range checkpoint:{}".format(
+                self._serialize()))
+
+        elif g_acp_type == CONST_DACP_TYPE:
+            t, i = self._look_for_valid(self._load_cp_nos)
+            if t is None:
+                self._restored_from = CONST_MEMORYINIT
+                return
+
+            self._cper.load_checkpoint(
+                self._checkpoint_path, [self],
+                self._checker.trainer_id,
+                checkpoint_no=i,
+                local_cache_path=self._checker._fs_cache)
+
+            self._restored_from = CONST_CHECKPOINT
+            self._checkpoint_epoch_no = self._epoch_no
+            logger.info("load tain_epoch_range checkpoint:{}".format(
+                self._serialize()))
+        else:
+            assert False, "not supported acp_type:{}".format(g_acp_type)
+
+    def _to_dict(self):
+        d = {
+            "max_epoch_num": self._max_epoch_num,
+            "epoch_no": self._epoch_no,
+            "name": self._name,
+            "checkpoint_path": self._checkpoint_path,
+            "restored_from": self._restored_from,
+            "checkpoint_epoch_no": self._checkpoint_epoch_no
+        }
+        return d
+
+    def __str__(self):
+        return self._serialize([])
+
+    @property
+    def name(self):
+        return self._name
+
+    def serialize(self, path):
+        file_name = "{}/{}".format(path, self._file_name)
+        with open(file_name, 'w') as f:
+            s = self._serialize()
+            f.write(s)
+
+    def _serialize(self, pop_keys=["restored_from", "checkpoint_epoch_no"]):
+        # self
+        d = self._to_dict()
+        for k in pop_keys:
+            d.pop(k, None)
+
+        # registerd exes
+        d["exe_status"] = {}
+        e = d["exe_status"]
+        for k, t in six.iteritems(self._exe_status):
+            e[t._key] = t._serialize()
+        return json.dumps(d)
+
+    @property
+    def restored_from(self):
+        return self._restored_from
+
+    def deserialize(self, path):
+        d = None
+        file_name = "{}/{}".format(path, self._file_name)
+        with open(file_name, 'r') as f:
+            d = json.load(f)
+
+        # self
+        self._max_epoch_num = d["max_epoch_num"]
+        self._epoch_no = d["epoch_no"]
+        self._name = d["name"]
+        self._checkpoint_path = d["checkpoint_path"]
+
+        # exes status
+        e = d["exe_status"]
+        for k, v in six.iteritems(e):
+            t = ExeTrainStatus()
+            t._deserialize(v)
+            self._exe_status[k] = t
+
+    def next(self):
+        _thread_checker()
+
+        if self._max_epoch_num < 0:
+            self._max_epoch_num = sys.maxint
+
+        assert self._epoch_no >= -1, "self._epoch_no:{} must >=-1".format(
+            self._epoch_no)
+
+        self._last_checkpoint_time = time.time()
+        start = self._epoch_no + 1
+        logger.info("started epoch_no:{} max_epoch_num:{}".format(
+            start, self._max_epoch_num))
+
+        for i in range(start, self._max_epoch_num):
+            self._epoch_no = i
+            yield i
+
+            self.save_checkpoint()
+
+    def get(self):
+        return self._epoch_no
+
+    def save_checkpoint(self):
+        # not save last one because exe and program can't be restored.
+        if self._checker.trainer_id == 0:
+
+            if time.time() - self._last_checkpoint_time >= \
+                    self._save_checkpoint_inter:
+                if g_acp_type == CONST_ACP_TYPE:
+                    # not save the last one
+                    if self._max_epoch_num > 0 and self._epoch_no != self._max_epoch_num - 1:
+                        self._save_checkpoint()
+                elif g_acp_type == CONST_DACP_TYPE:
+                    self._save_checkpoint()
+                else:
+                    assert False, "not supported acp_type:{}".format(g_acp_type)
+            self._last_checkpoint_time = time.time()
+
+    def _save_checkpoint(self):
+        """
+        status => /jobid/xxx_range_xx/range/
+        model =>                       /exe/
+        """
+        if not self._checker.valid():
+            return
+
+        e = self._exe_status
+        for k, t in six.iteritems(self._exe_status):
+            m = PaddleModel(t._exe, t._program)
+            p = self._checker.get_exe_checkpoint_path(t._hash_key)
+            t._epoch_no = self.get()
+            path, checkpoint_no = self._cper.save_checkpoint(
+                p, [m],
+                self._checker.trainer_id,
+                local_cache_path=self._checker._fs_cache)
+            # index info
+            t._checkpoint_path = path
+            t._checkpoint_no = checkpoint_no
+
+            e[t._key] = t
+
+            logger.debug("save executor checkpoint:{}".format(t._serialize()))
+
+        if len(self._exe_status) > 0:
+            self._cper.save_checkpoint(
+                self._checkpoint_path, [self],
+                local_cache_path=self._checker._fs_cache)
+            logger.info("save train_epoch_range checkpoint:{}".format(
+                self._serialize()))
+
+            self._generate_flag()
+
+    def _generate_flag(self):
+        if self._flag_generated:
+            return
+
+        name = "can_be_auto_checkpoint.flag"
+        path = self._checker.get_job_path() + "/" + name
+        logger.info("this job can_be_auto_checkpoint")
+        self._hdfs.mkdirs(self._checker.get_job_path())
+        self._hdfs.touch(path, exist_ok=True)
+
+        self._flag_generated = True
+
+
+def _get_train_epoch_range():
+    return g_train_epoch_range
+
+
+def _check_program_oprole(program):
+    global_block = program.global_block()
+    has_backward = False
+    has_opt = False
+    for idx, op in enumerate(global_block.ops):
+        if op._is_backward_op():
+            has_backward = True
+
+        if op._is_optimize_op():
+            has_opt = True
+
+        if has_backward and has_opt:
+            return True
+
+    return False
+
+
+def _can_auto_checkpoint(prog):
+    if not isinstance(prog, compiler.CompiledProgram) and \
+            not isinstance(prog, Program):
+        return False
+
+    if isinstance(prog, compiler.CompiledProgram):
+        if prog._program is None or \
+                prog._program._is_distributed:
+            return False
+    else:
+        if prog._is_distributed:
+            return False
+
+    program = _get_valid_program(prog)
+
+    if program._auto_checkpoint_name in g_program_attr:
+        if not g_program_attr[program._auto_checkpoint_name]:
+            return False
+    else:
+        ret = False
+        if isinstance(program, compiler.CompiledProgram):
+            ret = _check_program_oprole(program._program)
+        else:
+            ret = _check_program_oprole(program)
+
+        g_program_attr[program._auto_checkpoint_name] = ret
+        if not ret:
+            logger.debug("program {} need't to auto checkpoint".format(
+                program._auto_checkpoint_name))
+            return False
+
+    return g_checker.valid() and g_train_epoch_range is not None
+
+
+def _get_running_key(exe_name, program_name):
+    return "{}_{}".format(exe_name, program_name)
+
+
+def _get_checker():
+    _get_logger(20)
+    global g_checker
+    if g_checker is None:
+        g_checker = AutoCheckpointChecker()
+
+    return g_checker
+
+
+def _normal_yield(max_epoch_num):
+    if max_epoch_num < 0:
+        max_epoch_num = sys.maxint
+    for i in range(0, max_epoch_num):
+        yield i
+
+    return
+
+
+def train_epoch_range(max_epoch_num, save_checkpoint_inter=None):
+    global g_acp_type
+    if not _get_checker().valid():
+        logger.warning(
+            "auto checkpoint will take effect  automaticly on PaddleCloud")
+        for i in _normal_yield(max_epoch_num):
+            yield i
+
+        return
+
+    if g_acp_type == CONST_DACP_TYPE:
+        for i in _normal_yield(max_epoch_num):
+            yield i
+
+        return
+
+    g_acp_type = CONST_ACP_TYPE
+    logger.info("acp_type:{}".format(g_acp_type))
+
+    global g_train_epoch_range
+    try:
+        g_train_epoch_range = TrainEpochRange(
+            max_epoch_num,
+            g_checker.generate_range_name(),
+            checkpoint_inter=save_checkpoint_inter)
+
+        for i in g_train_epoch_range.next():
+            yield i
+    finally:
+        g_train_epoch_range = None
+
+
+def _get_valid_program(prog):
+    if isinstance(prog, compiler.CompiledProgram):
+        return prog._program
+
+    return prog
+
+
+def _auto_checkpoint(exe, prog):
+    _get_checker()
+
+    assert exe._auto_checkpoint_name != None
+    if not _can_auto_checkpoint(prog):
+        return
+
+    program = _get_valid_program(prog)
+    assert program._auto_checkpoint_name != None
+
+    exe_status = g_train_epoch_range._exe_status
+    key = _get_running_key(exe._auto_checkpoint_name,
+                           program._auto_checkpoint_name)
+
+    if g_train_epoch_range.restored_from == CONST_CHECKPOINT:
+        assert key in exe_status, "when restored key:{} must be in train_epoch_range:{}".format(
+            key, g_train_epoch_range)
+
+    t = None
+    if key in exe_status:
+        t = exe_status[key]
+        if t._restored_from is None:
+            a = CheckpointSaver(g_train_epoch_range._hdfs)
+            m = PaddleModel(exe, program)
+            a.load_checkpoint(
+                g_checker.get_exe_checkpoint_path(key), [m],
+                trainer_id=g_checker.trainer_id,
+                checkpoint_no=t._checkpoint_no,
+                local_cache_path=g_checker._fs_cache)
+            t._restored_from = CONST_CHECKPOINT
+            logger.info("load executor checkpoint {}".format(t))
+        t._exe = exe
+        t._program = program
+        t._epoch_no = g_train_epoch_range.get()
+    else:
+        t = ExeTrainStatus()
+        t._epoch_no = g_train_epoch_range.get()
+        t._hash_key = key
+        t._key = key
+        t._restored_from = CONST_MEMORYINIT
+        t._exe = exe
+        t._program = program
+        t._exe_name = exe._auto_checkpoint_name
+        t._program_name = program._auto_checkpoint_name
+
+        # register this <exe,program,io>
+        exe_status[key] = t
+
+        logger.info("not found checkpoint, so train from epoch 0")
+
+    _thread_checker()
diff --git a/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..08400ab13a25dd0f460bb3aedc30936bbca0d83a
--- /dev/null
+++ b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...compiler import CompiledProgram
+
+
+class SerializableBase(object):
+    def serialize(self, path):
+        raise NotImplementedError
+
+    def deserialize(self, path):
+        raise NotImplementedError
+
+
+class PaddleModel(SerializableBase):
+    def __init__(self, exe, program):
+        self._exe = exe
+        self._origin_program = program
+        self._program = program
+        if isinstance(program, CompiledProgram):
+            self._program = program._program
+
+        self._file_name = "_paddle_fleet_param__"
+
+    def serialize(self, path):
+        from ...io import save_persistables
+        save_persistables(
+            executor=self._exe,
+            dirname=path,
+            main_program=self._program,
+            filename=self._file_name)
+
+    def deserialize(self, path):
+        from ...io import load_persistables
+        load_persistables(
+            executor=self._exe,
+            dirname=path,
+            main_program=self._program,
+            filename=self._file_name)
+
+
+class CheckpointSaver(object):
+    def __init__(self, fs):
+        self._fs = fs
+        self._checkpoint_prefix = "__paddle_checkpoint__"
+
+    def save_checkpoint(self,
+                        path,
+                        slists,
+                        trainer_id=None,
+                        local_cache_path=".cache"):
+        """
+        Serialize objects in slists to path
+        Return really saved path and checkpoint_no
+        """
+        if not self._fs.is_exist(path):
+            self._fs.mkdirs(path)
+        else:
+            assert self._fs.is_dir(path), "path:{} must be a directory".format(
+                path)
+
+        max_no = self._get_last_checkpoint_no(path)
+        if max_no < 0:
+            max_no = -1
+        max_no += 1
+
+        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
+        tmp_path = "{}.tmp".format(real_path)
+        saved_path = tmp_path
+
+        from paddle.distributed.fleet.utils.fs import LocalFS
+        local_fs = LocalFS()
+
+        cache_path = None
+        if self._fs.need_upload_download():
+            cache_path = "{}/{}.{}.saved_cache".format(
+                local_cache_path, self._checkpoint_prefix, max_no)
+
+            if trainer_id is not None:
+                cache_path = "{}.{}".format(cache_path, trainer_id)
+
+            if not local_fs.is_exist(cache_path):
+                local_fs.mkdirs(cache_path)
+            else:
+                assert local_fs.is_dir(cache_path), \
+                    "cache path:{} must be a directory".format(cache_path)
+
+            saved_path = cache_path
+
+        for s in slists:
+            s.serialize(saved_path)
+
+        if self._fs.need_upload_download():
+            self._fs.delete(tmp_path)
+            self._fs.upload(cache_path, tmp_path)
+            local_fs.delete(cache_path)
+        self._fs.mv(tmp_path, real_path)
+
+        return real_path, max_no
+
+    def load_checkpoint(self,
+                        path,
+                        slists,
+                        trainer_id,
+                        local_cache_path=".cache",
+                        checkpoint_no=None,
+                        ignore_empty=True):
+        """
+        Deserialize objects in slists from path
+        Return really load path
+        """
+        if checkpoint_no is None:
+            max_no = self._get_last_checkpoint_no(path)
+
+            if not ignore_empty:
+                assert max_no >= 0, "Can't find checkpoint"
+
+            if max_no < 0:
+                return None
+
+            checkpoint_no = max_no
+        else:
+            assert isinstance(checkpoint_no, int)
+            assert checkpoint_no >= 0
+
+        from paddle.distributed.fleet.utils.fs import LocalFS
+        local_fs = LocalFS()
+        if self._fs.need_upload_download():
+            cache_path = "{}/{}.{}.load_cache".format(
+                local_cache_path, self._checkpoint_prefix, checkpoint_no)
+
+            if trainer_id is not None:
+                cache_path = "{}.{}".format(cache_path, trainer_id)
+
+            if not local_fs.is_exist(local_cache_path):
+                local_fs.mkdirs(local_cache_path)
+            if local_fs.is_exist(cache_path):
+                local_fs.delete(cache_path)
+
+        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
+                                      checkpoint_no)
+        load_path = real_path
+        if self._fs.need_upload_download():
+            self._fs.download(real_path, cache_path)
+            load_path = cache_path
+
+        for s in slists:
+            s.deserialize(load_path)
+
+        if self._fs.need_upload_download() and cache_path:
+            local_fs.delete(cache_path)
+
+        return real_path
+
+    def get_checkpoint_no(self, root_path):
+        a = []
+        dirs = self._fs.list_dirs(root_path)
+        for d in dirs:
+            g = d.split(".")
+            if len(g) != 2:
+                continue
+
+            if g[0] != self._checkpoint_prefix:
+                continue
+
+            try:
+                n = int(g[1])
+                a.append(n)
+            except:
+                continue
+
+        a.sort()
+        return a
+
+    def _get_last_checkpoint_no(self, root_path):
+        """
+        only get the first depth
+        """
+        a = self.get_checkpoint_no(root_path)
+        if len(a) > 0:
+            return a[-1]
+
+        return -1
+
+    def clean_redundant_checkpoints(self, root_path, reserved=[]):
+        max_no = self._get_last_checkpoint_no(root_path)
+        if max_no < 0:
+            return
+
+        s = set(reserved)
+        if len(s) == 0:
+            s.add(max_no)
+
+        dirs = self._fs.list_dirs(root_path)
+        for d in dirs:
+            g = d.split(".")
+            if len(g) != 2:
+                continue
+
+            if g[0] != self._checkpoint_prefix:
+                continue
+
+            try:
+                n = int(g[1])
+                if n not in s:
+                    path = "{}/{}.{}".format(root_path, self._checkpoint_prefix,
+                                             n)
+                    self._fs.delete(path)
+            except Exception as e:
+                print(e)
+                continue
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
index 26085ec846512eefd3df962c88e56228daf34784..f885e51ef7f0d82ca50c7beb6ee6cd443dfc61d4 100644
--- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py
+++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
@@ -21,7 +21,7 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import SGD
 
 from paddle.fluid.incubate.fleet.base.mode import Mode
-from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
+from paddle.distributed.fleet.base.role_maker import RoleMakerBase
 from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision
 from . import mode
 
@@ -149,6 +149,16 @@ class Fleet(object):
         """
         return self._role_maker.is_server()
 
+    def is_xpu(self):
+        """
+        Check whether the node is an instance of server.
+
+        Returns:
+            bool: True if this is a node of server,
+                  False if not.
+        """
+        return self._role_maker.is_xpu()
+
     def split_files(self, files):
         """
         split files before distributed training,
@@ -199,7 +209,10 @@ class Fleet(object):
         self._executor = Executor(fluid.CPUPlace())
 
         if role_maker and not isinstance(role_maker, RoleMakerBase):
-            raise TypeError("role_maker must be an instance of RoleMakerBase")
+            from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase as RoleMakerBaseIncubate
+            if role_maker and not isinstance(role_maker, RoleMakerBaseIncubate):
+                raise TypeError(
+                    "role_maker must be an instance of RoleMakerBase")
 
         self._role_maker = role_maker
         self._role_maker.generate_role()
diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py
index 8596bd05a8685f6c4feccdeecd295fd10abb09c9..7f8db694d3601be072ab30ffbbd345b25ffafd80 100644
--- a/python/paddle/fluid/incubate/fleet/base/role_maker.py
+++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py
@@ -28,6 +28,7 @@ __all__ = [
 class Role:
     WORKER = 1
     SERVER = 2
+    XPU = 3
 
 
 class MockBarrier(object):
@@ -988,6 +989,147 @@ class GeneralRoleMaker(RoleMakerBase):
         http_server.stop()
 
 
+class HeterRoleMaker(GeneralRoleMaker):
+    """
+    This role maker is for general use, you can set os.environ to customize:
+        PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, separated by ','
+        PADDLE_TRAINER_ENDPOINTS     : all trainers' ip:port, separated by ','
+        TRAINING_ROLE                : TRAINER or PSERVER
+        PADDLE_TRAINER_ID            : current trainer id (only for trainer),
+                                       it is index in PADDLE_TRAINER_ENDPOINTS
+        PADDLE_PSERVER_ID            : current pserver id (only for pserver)
+                                       it is index in PADDLE_PSERVERS_IP_PORT_LIST
+    """
+
+    def generate_role(self):
+        """
+        generate role for general role maker
+        """
+        if not self._role_is_generated:
+            eplist = os.environ["PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
+            training_role = os.environ["TRAINING_ROLE"]
+            worker_endpoints = os.environ["PADDLE_TRAINER_ENDPOINTS"].split(",")
+            trainers_num = len(worker_endpoints)
+            xpu_endpoints = os.environ["PADDLE_XPU_ENDPOINTS"].split(",")
+            xpu_num = len(xpu_endpoints)
+            if training_role not in ["TRAINER", "PSERVER", "XPU"]:
+                raise ValueError(
+                    "TRAINING_ROLE must be PSERVER or TRAINER or XPU")
+            if training_role == "TRAINER":
+                role = Role.WORKER
+                current_id = int(os.environ["PADDLE_TRAINER_ID"])
+                self._node_type = 1
+                self._cur_endpoint = worker_endpoints[current_id]
+                gloo = fluid.core.Gloo()
+                gloo.init(current_id,
+                          len(worker_endpoints),
+                          self._hdfs_path.rstrip("/") + "/trainer",
+                          self._hdfs_name, self._hdfs_ugi, self._iface,
+                          self._prefix)
+                self._node_type_comm = gloo
+            elif training_role == "XPU":
+                role = Role.XPU
+                current_id = int(os.environ["PADDLE_XPU_ID"])
+                self._node_type = 2
+                self._cur_endpoint = xpu_endpoints[current_id]
+                gloo = fluid.core.Gloo()
+                gloo.init(current_id,
+                          len(xpu_endpoints),
+                          self._hdfs_path.rstrip("/") + "/xpu", self._hdfs_name,
+                          self._hdfs_ugi, self._iface, self._prefix)
+                self._node_type_comm = gloo
+            elif training_role == "PSERVER":
+                role = Role.SERVER
+                if os.environ.get("PADDLE_PSERVER_ID") is not None:
+                    current_id = int(os.environ["PADDLE_PSERVER_ID"])
+                    cur_endpoint = eplist[current_id]
+                else:
+                    # this is for compatible with paddlecloud
+                    cur_ip = os.environ["POD_IP"]
+                    cur_port = os.environ["PADDLE_PORT"]
+                    cur_endpoint = ":".join([cur_ip, cur_port])
+                    current_id = eplist.index(cur_endpoint)
+                self._node_type = 0
+                self._cur_endpoint = cur_endpoint
+                gloo = fluid.core.Gloo()
+                gloo.init(current_id,
+                          len(eplist),
+                          self._hdfs_path.rstrip("/") + "/pserver",
+                          self._hdfs_name, self._hdfs_ugi, self._iface,
+                          self._prefix)
+                self._node_type_comm = gloo
+
+            if training_role == "TRAINER" or training_role == "XPU":
+                gloo = fluid.core.Gloo()
+                heter_list = worker_endpoints + xpu_endpoints
+                gloo.init(
+                    heter_list.index(self._cur_endpoint),
+                    len(heter_list),
+                    self._hdfs_path.rstrip("/") + "/heter", self._hdfs_name,
+                    self._hdfs_ugi, self._iface, self._prefix)
+                self._heter_comm = gloo
+
+            gloo = fluid.core.Gloo()
+            all_list = worker_endpoints + eplist + xpu_endpoints
+            gloo.init(
+                all_list.index(self._cur_endpoint),
+                len(all_list),
+                self._hdfs_path.rstrip("/") + "/all", self._hdfs_name,
+                self._hdfs_ugi, self._iface, self._prefix)
+
+            self._all_comm = gloo
+            self._trainers_num = trainers_num
+            self._server_endpoints = eplist
+            self._role = role
+            self._current_id = current_id
+            self._rank = all_list.index(self._cur_endpoint)
+            self._size = len(all_list)
+            self._worker_endpoints = worker_endpoints
+            self._xpu_endpoints = xpu_endpoints
+            self._role_is_generated = True
+
+    def is_xpu(self):
+        """
+        whether current process is server
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.XPU
+
+    def is_first_xpu(self):
+        """
+        whether current process is worker of rank 0
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return self._role == Role.XPU and self._current_id == 0
+
+    def _barrier_xpu(self):
+        """
+        barrier all workers in current distributed job
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        if self.is_xpu():
+            self._node_type_comm.barrier()
+
+    def _barrier_heter(self):
+        """
+        barrier all workers in current distributed job
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        if self.is_xpu() or self.is_worker:
+            self._heter_comm.barrier()
+
+    def xpu_num(self):
+        """
+        """
+        if not self._role_is_generated:
+            self.generate_role()
+        return len(self._xpu_endpoints)
+
+
 class UserDefinedRoleMaker(RoleMakerBase):
     """
     UserDefinedRoleMaker is designed for worker and server assignment
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
index 4bcd5196a3b3cc5a11a93d27d4dc3dffe61e8644..6e5aae82517d1e0f408ebd7311e1c77a86fe426f 100644
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -26,7 +26,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 
 from paddle.fluid import compiler
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel, CheckpointSaver
 
 import os
 import sys
@@ -46,21 +46,6 @@ class DistFCConfig(object):
         pass
 
 
-class TrainStatus(object):
-    def __init__(self, epoch_no=-1):
-        # completed epoch
-        self._epoch_no = epoch_no
-
-    def next(self):
-        return self._epoch_no + 1
-
-    def __eq__(self, t):
-        return self._epoch_no == t._epoch_no
-
-    def __ne__(self, t):
-        return not self == t
-
-
 class Collective(Fleet):
     def __init__(self):
         super(Collective, self).__init__(Mode.COLLECTIVE)
@@ -152,189 +137,58 @@ class Collective(Fleet):
 
         io.save_persistables(executor, dirname, main_program, filename=filename)
 
-    def _save_train_status(self, path, train_status):
-        d = {}
-        d["epoch_no"] = train_status._epoch_no
-
-        file_name = "{}/fleet_train_status".format(path)
-        with open(file_name, 'w') as f:
-            json.dump(d, f)
-
-    def _load_train_status(self, path):
-        file_name = "{}/fleet_train_status".format(path)
-
-        r = TrainStatus()
-        if not os.path.isfile(file_name):
-            return r
-
-        d = {}
-        with open(file_name, 'r') as f:
-            d = json.load(f)
-
-        assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(
-            d)
-        r._epoch_no = d["epoch_no"]
-        assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(
-            d)
-
-        return r
-
-    def _get_last_checkpoint_no(self, root_path, fs):
-        """
-        only get the first depth
-        """
-        max_no = -1
-        d = {}
-        dirs = fs.list_dirs(root_path)
-        for d in dirs:
-            g = d.split(".")
-            if len(g) != 2:
-                continue
-
-            if g[0] != "__paddle_fleet_checkpoint__":
-                continue
-
-            try:
-                n = int(g[1])
-                if n > max_no:
-                    max_no = n
-            except:
-                continue
-
-        return max_no
-
-    def clean_redundant_checkpoints(self,
-                                    root_path,
-                                    fs=LocalFS(),
-                                    checkpoint_num=1):
-        max_no = self._get_last_checkpoint_no(root_path, fs)
-        if max_no < 0:
-            return
-
-        if checkpoint_num < 1:
-            checkpoint_num = 1
-
-        dirs = fs.list_dirs(root_path)
-        for d in dirs:
-            g = d.split(".")
-            if len(g) != 2:
-                continue
-
-            if g[0] != self._checkpoint_prefix:
-                continue
-
-            try:
-                n = int(g[1])
-                if n <= max_no - checkpoint_num:
-                    path = "{}/{}.{}".format(root_path, self._checkpoint_prefix,
-                                             n)
-                    fs.delete(path)
-            except Exception as e:
-                print(e)
-                continue
-
     def save_checkpoint(self,
                         executor,
                         path,
+                        trainer_id,
                         train_status,
+                        fs,
                         main_program=None,
-                        fs=LocalFS(),
                         local_cache_path=".cache",
                         remain_all_checkpoint=True):
         """
         This function save persistables and current epoch num to path.
         """
-
         if main_program == None:
             main_program = self._transpiled_program
 
-        if not fs.is_exist(path):
-            fs.mkdirs(path)
-        else:
-            assert fs.is_dir(path), "path:%s must be a directory".format(path)
-
-        max_no = self._get_last_checkpoint_no(path, fs=fs)
-        if max_no < 0:
-            max_no = -1
-
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no + 1)
-        tmp_path = "{}.tmp".format(real_path)
-        saved_path = tmp_path
-
-        local_fs = LocalFS()
-
-        cache_path = None
-        if fs.need_upload_download():
-            cache_path = "{}/{}.{}.saved_cache".format(
-                local_cache_path, self._checkpoint_prefix, max_no + 1)
-            if not local_fs.is_exist(cache_path):
-                local_fs.mkdirs(cache_path)
-            else:
-                assert fs.is_dir(
-                    path), "cache path:{} must be a directory".format(
-                        cache_path)
-
-            saved_path = cache_path
-
-        self.save_persistables(
-            executor=executor,
-            dirname=saved_path,
-            main_program=main_program,
-            filename=self._param_file_name)
-        self._save_train_status(path=saved_path, train_status=train_status)
-
-        if fs.need_upload_download():
-            fs.delete(tmp_path)
-            fs.upload(cache_path, tmp_path)
-        fs.mv(tmp_path, real_path)
+        m = PaddleModel(executor, main_program)
+        t = train_status
+        c = CheckpointSaver(fs)
+        real_path, checkpoint_no = c.save_checkpoint(
+            path=path,
+            slists=[m, t],
+            trainer_id=trainer_id,
+            local_cache_path=local_cache_path)
 
         if not remain_all_checkpoint:
-            self.clean_redundant_checkpoints(path)
+            c.clean_redundant_checkpoints(path)
+
+        return real_path, checkpoint_no
 
     def load_checkpoint(self,
                         executor,
                         path,
                         trainer_id,
+                        train_status,
+                        fs,
                         main_program=None,
-                        fs=LocalFS(),
                         local_cache_path=".cache",
                         ignore_empty=True):
         """
         This function load persistables and current epoch num from path.
         """
-        max_no = self._get_last_checkpoint_no(path, fs)
-
-        if not ignore_empty:
-            assert max_no >= 0, "Can't find checkpoint"
-
-        if max_no < 0:
-            return None
-
-        local_fs = LocalFS()
-        if fs.need_upload_download():
-            cache_path = "{}/{}.{}.load_cache.{}".format(
-                local_cache_path, self._checkpoint_prefix, max_no, trainer_id)
-            if not local_fs.is_exist(local_cache_path):
-                local_fs.mkdirs(local_cache_path)
-            if local_fs.is_exist(cache_path):
-                local_fs.delete(cache_path)
-
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
-        load_path = real_path
-        if fs.need_upload_download():
-            fs.download(real_path, cache_path)
-            load_path = cache_path
 
         if main_program == None:
             main_program = self._transpiled_program
 
-        io.load_persistables(
-            executor=executor,
-            dirname=load_path,
-            main_program=main_program,
-            filename=self._param_file_name)
-
-        return self._load_train_status(load_path)
+        m = PaddleModel(executor, main_program)
+        c = CheckpointSaver(fs)
+        return c.load_checkpoint(
+            path, [m, train_status],
+            trainer_id=trainer_id,
+            ignore_empty=ignore_empty,
+            local_cache_path=local_cache_path)
 
 
 fleet = Collective()
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index a7d86411e203728116604ffafddf36a1cfaed9b3..1a7a82fbfac19b41e8b96c231ca74398f6b2214c 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -38,6 +38,7 @@ from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
 from paddle.fluid.incubate.fleet.parameter_server import version
 from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
 from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops
+from paddle.fluid.incubate.fleet.parameter_server.ir.public import _has_global_step
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import TrainerRuntimeConfig, DistributedStrategy, \
     SyncStrategy, AsyncStrategy, HalfAsyncStrategy, GeoStrategy, StrategyFactory
 
@@ -161,9 +162,9 @@ class FleetTranspiler(Fleet):
 
         print(trainer_config)
 
-        lrs = _get_lr_ops(self._origin_main_program)
+        lrs = _has_global_step(_get_lr_ops(self._origin_main_program))
 
-        if len(lrs) > 0:
+        if lrs > 0:
             kwargs = {"need_global_step": "1"}
         else:
             kwargs = {"need_global_step": "0"}
@@ -186,14 +187,6 @@ class FleetTranspiler(Fleet):
             recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                 recv_type=1)
 
-        for name, ctx in send_ctx.items():
-            print("name: {}, ctx: {}".format(name, ctx))
-
-        print("==== = ==== =============== ====")
-
-        for name, ctx in recv_ctx.items():
-            print("name: {}, ctx: {}".format(name, ctx))
-
         from paddle.fluid.communicator import Communicator
         self._communicator = Communicator(
             trainer_config.mode, kwargs,
@@ -393,6 +386,12 @@ class FleetTranspiler(Fleet):
                 "in fleet.save_inference_model() function, executor must be as Executor type"
             )
 
+        # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save
+        if not isinstance(executor.place, fluid.CPUPlace):
+            save_executor = Executor(fluid.CPUPlace())
+        else:
+            save_executor = executor
+
         if main_program is not None:
             if isinstance(main_program, CompiledProgram):
                 raise TypeError(
@@ -579,7 +578,7 @@ class FleetTranspiler(Fleet):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self._role_maker.worker_id(),
+                        "trainer_id": self._role_maker.worker_index(),
                         "shape": var.shape,
                         "slice_shapes":
                         [",".join([str(i) for i in var.shape])],
@@ -670,6 +669,11 @@ if you would like to save all variables in a
             raise TypeError(
                 "in fleet.save_persistables() function, executor must be as Executor type"
             )
+        # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save
+        if not isinstance(executor.place, fluid.CPUPlace):
+            save_executor = Executor(fluid.CPUPlace())
+        else:
+            save_executor = executor
 
         if main_program is None:
             main_program = self.main_program
@@ -679,7 +683,8 @@ if you would like to save all variables in a
                 "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
             )
 
-        self._save_distributed_persistables(executor, dirname, main_program)
+        self._save_distributed_persistables(save_executor, dirname,
+                                            main_program)
 
     @staticmethod
     def __exclude_vars(exclude_var_names=[]):
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index 2056e3deb18476748df0e16bc18b59f0a1074d55..f9889997d9e38c98c4a736a62dbc72da7029f337 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -43,6 +43,8 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR
 OP_NAME_SCOPE = "op_namescope"
 CLIP_OP_NAME_SCOPE = "@CLIP"
 STEP_COUNTER = "@PS_STEP_COUNTER@"
+LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@"
+
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName()
 RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
@@ -62,6 +64,17 @@ def _get_lr_ops(program):
     return lr_ops
 
 
+def _has_global_step(lr_ops):
+    if len(lr_ops) > 0:
+        for idx, op in enumerate(lr_ops):
+            if op.type != 'increment':
+                continue
+            counter = op.input("X")[0]
+            if counter == LEARNING_RATE_DECAY_COUNTER:
+                return True
+    return False
+
+
 def is_sparse_op(op):
     if op.type == "lookup_table" and op.attr('is_sparse') is True and op.attr(
             'is_distributed') is False:
@@ -329,7 +342,7 @@ class CompileTimeStrategy(object):
 
                 is_distributed = True if param_name in distibuted_varnames else False
 
-                ctx = self.build_ctx(grad, self.grad_var_mapping, True, False,
+                ctx = self.build_ctx(grad, self.grad_var_mapping, True, True,
                                      True, is_distributed)
                 send_ctx[ctx.var_name()] = ctx
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
index 402250455f79dee24bc87ea7fb9136ae24a68e23..2a1945532e654605d2e2d45206daa3cd8306737f 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
@@ -23,6 +23,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
 from paddle.fluid.incubate.fleet.base.mode import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
+from paddle.fluid.incubate.fleet.base.role_maker import HeterRoleMaker
 
 
 class PSLib(Fleet):
@@ -44,6 +45,9 @@ class PSLib(Fleet):
             role_maker = MPISymetricRoleMaker()
         super(PSLib, self).init(role_maker)
         self._fleet_ptr = fluid.core.Fleet()
+        self._heter_ptr = None
+        if isinstance(role_maker, HeterRoleMaker):
+            self._heter_ptr = fluid.core.Heter()
 
     def _set_client_communication_config(self, request_timeout_ms,
                                          connect_timeout_ms, max_retry):
@@ -77,23 +81,35 @@ class PSLib(Fleet):
                 raise Exception(
                     "You should run DistributedOptimizer.minimize() first")
             # barrier_all for init_server, wait for server starts
+            if isinstance(self._role_maker, HeterRoleMaker):
+                if self._role_maker.is_xpu():
+                    local_endpoint = self._role_maker.get_local_endpoint()
+                    local_endpoint = local_endpoint.split(":")
+                    self._heter_ptr.start_xpu_service(
+                        str(local_endpoint[0]), int(local_endpoint[1]))
             self._role_maker._barrier_all()
             self.all_ips_ = self._role_maker._all_gather(self._local_ip)
             # worker_index * 2 is for compatible with older versions of pslib
             self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
                                         self._role_maker._get_size(),
                                         self._role_maker.worker_index() * 2)
+            if isinstance(self._role_maker, HeterRoleMaker):
+                if self._role_maker.is_worker():
+                    self._heter_ptr.set_xpu_list(
+                        self._role_maker._xpu_endpoints)
+                    self._heter_ptr.create_client2xpu_connection()
             # barrier_all for init_worker
             self._role_maker._barrier_all()
             # prepare for client to client communication
-            info = self._fleet_ptr.get_clients_info()
-            all_info = self._role_maker._worker_gather(info[0])
-            self._fleet_ptr.gather_clients(all_info)
-            self._fleet_ptr.set_client2client_config(
-                self._client2client_request_timeout_ms,
-                self._client2client_connect_timeout_ms,
-                self._client2client_max_retry)
-            self._fleet_ptr.create_client2client_connection()
+            if self._role_maker.is_worker():
+                info = self._fleet_ptr.get_clients_info()
+                all_info = self._role_maker._worker_gather(info[0])
+                self._fleet_ptr.gather_clients(all_info)
+                self._fleet_ptr.set_client2client_config(
+                    self._client2client_request_timeout_ms,
+                    self._client2client_connect_timeout_ms,
+                    self._client2client_max_retry)
+                self._fleet_ptr.create_client2client_connection()
             # barrier for init model
             self._role_maker._barrier_worker()
             if self._role_maker.is_first_worker():
@@ -144,10 +160,16 @@ class PSLib(Fleet):
             >>> fleet.init_server("/you/path/to/model", mode = 0)
         """
         mode = kwargs.get("mode", 0)
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.load_model(model_dir, mode)
-        self._role_maker._barrier_worker()
+        if isinstance(self._role_maker, HeterRoleMaker):
+            self._role_maker._barrier_xpu()
+            if self._role_maker.is_first_xpu():
+                self._fleet_ptr.load_model(model_dir, mode)
+            self._role_maker._barrier_xpu()
+        else:
+            self._role_maker._barrier_worker()
+            if self._role_maker.is_first_worker():
+                self._fleet_ptr.load_model(model_dir, mode)
+            self._role_maker._barrier_worker()
 
     def run_server(self):
         """
@@ -185,6 +207,54 @@ class PSLib(Fleet):
             raise Exception(
                 "You should run DistributedOptimizer.minimize() first")
 
+    def end_pass(self, scope):
+        if self._role_maker.worker_index() < self._role_maker.xpu_num():
+            self._heter_ptr.end_pass(scope, self._role_maker.worker_index())
+            self._heter_ptr.stop_xpu_service(self._role_maker.worker_index())
+
+    def train_from_dataset(self,
+                           executor,
+                           program=None,
+                           dataset=None,
+                           scope=None,
+                           thread=0,
+                           debug=False,
+                           fetch_list=None,
+                           fetch_info=None,
+                           print_period=100,
+                           fetch_handler=None):
+        """
+
+        """
+
+        if self._role_maker.is_worker():
+            self._role_maker._barrier_heter()
+        executor.train_from_dataset(program, dataset, scope, thread, debug,
+                                    fetch_list, fetch_info, print_period,
+                                    fetch_handler)
+
+    def start_heter_trainer(self,
+                            executor,
+                            program=None,
+                            scope=None,
+                            debug=False,
+                            fetch_list=None,
+                            fetch_info=None,
+                            print_period=100,
+                            fetch_handler=None):
+        """
+
+        """
+
+        trainer_instance = executor.start_heter_trainer(
+            program, scope, debug, fetch_list, fetch_info, print_period,
+            fetch_handler)
+        if self._role_maker.is_xpu():
+            print("barrier heter")
+            self._role_maker._barrier_heter()
+            print("barrier heter")
+        executor._default_executor.release_trainer(trainer_instance)
+
     def stop_worker(self):
         """
         stop(): will be called after a user finishes his/her training task. Fleet instance will be
@@ -197,6 +267,7 @@ class PSLib(Fleet):
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
             self._fleet_ptr.stop_server()
+            self._heter_ptr.stop_xpu_service()
         self._role_maker._barrier_worker()
         self._role_maker._barrier_all()
         self._role_maker._finalize()
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 232d3e0422e5542e1fd13efd80486ff9bb3d4a22..5cd1aa884a928db4980933091d951010ce347444 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -509,13 +509,15 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info = {}
         opt_info["program_id_to_worker"] = prog_id_to_worker
         opt_info["program_configs"] = program_configs
-        opt_info["trainer"] = "DistMultiTrainer"
+        opt_info["trainer"] = strategy.get("trainer", "DistMultiTrainer")
         opt_info["device_worker"] = strategy.get("device_worker", "DownpourSGD")
         opt_info["optimizer"] = "DownpourSGD"
         opt_info["fleet_desc"] = ps_param
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["no_cvm"] = strategy.get("no_cvm", False)
+        opt_info["worker_class"] = strategy.get("worker_class",
+                                                "DownpourWorker")
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
         opt_info["local_tables"] = strategy.get("local_tables", [])
         opt_info["async_tables"] = strategy.get("async_tables", [])
@@ -529,6 +531,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["dump_file_num"] = strategy.get("dump_file_num", 16)
         opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
         opt_info["dump_param"] = strategy.get("dump_param", [])
+        opt_info["worker_places"] = strategy.get("worker_places", [])
         if server._server.downpour_server_param.downpour_table_param[
                 0].accessor.accessor_class in [
                     "DownpourCtrAccessor", "DownpourCtrDoubleAccessor",
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
index 3ae61891514ccaa96cc8e7429d1a988a4618173a..cb1a54ef19899059d1a46d0807ce58bf3b5ab8b5 100644
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
+++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
@@ -14,6 +14,7 @@
 """Fleet Utils."""
 
 import collections
+import copy
 import json
 import logging
 import math
@@ -1615,3 +1616,123 @@ class FleetUtil(object):
         """
         program = utils.load_program(prog_path, is_text)
         utils.parse_program(program, output_dir)
+
+    def split_program_by_device(self, program):
+        ops_list = []
+        type_list = []
+        pre = None
+        type_cpu = "cpu"
+        for op in program.global_block().ops:
+            if op.has_attr("op_device"):
+                if pre is None or pre != op.attr("op_device"):
+                    ops_list.append([])
+                    type_list.append(
+                        op.attr("op_device")
+                        if op.attr("op_device") != "" else type_cpu)
+                ops_list[-1].append(op)
+                pre = op.attr("op_device")
+        l = len(type_list)
+        i = 0
+        type_heter = None
+        while i < l:
+            while i < l and type_list[i] == type_cpu:
+                i += 1
+            if i == l:
+                break
+
+            type_heter = type_list[i]
+            i += 1
+            start = i
+            valid = True
+            while i < l and type_list[i] != type_heter:
+                if type_list[i] != type_cpu:
+                    valid = False
+                    break
+                i += 1
+
+            if i == l:
+                break
+            elif not valid:
+                continue
+
+            for j in range(start, i):
+                for op in ops_list[j]:
+                    op._set_attr("op_device", type_heter)
+                type_list[j] = type_heter
+                j += 1
+
+        pre = None
+        merged_ops_list = []
+        merged_type_list = []
+        for i in range(l):
+            if pre is None or pre != type_list[i]:
+                merged_ops_list.append([])
+                merged_type_list.append(type_list[i])
+            merged_ops_list[-1].extend(ops_list[i])
+            pre = type_list[i]
+
+        data_vars = set()
+        for k in program.global_block().vars:
+            var = program.global_block().var(k)
+            if not var.persistable:
+                data_vars.add(var.name)
+
+        l = len(merged_ops_list)
+        inputs_pre = set()
+        outputs_pre = set()
+        in_from_pre = [[] for i in range(l)]
+        for i in range(l):
+            inputs = set()
+            outputs = set()
+            for op in merged_ops_list[i]:
+                for input in op.input_names:
+                    for tmp in op.input(input):
+                        if tmp not in outputs:
+                            inputs.add(tmp)
+                for output in op.output_names:
+                    for tmp in op.output(output):
+                        outputs.add(tmp)
+            if i == 0:
+                in_from_pre[i] = []
+            elif i == 1:
+                in_from_pre[i] = (outputs_pre | data_vars) & inputs
+            else:
+                in_from_pre[i] = outputs_pre & inputs
+            inputs_pre = copy.deepcopy(inputs)
+            outputs_pre = copy.deepcopy(outputs)
+
+        l = len(in_from_pre)
+        start_list = []
+        end_list = []
+        send_list = [[] for i in range(l)]
+        sum = 0
+        program_list = []
+        for i in range(l):
+            start_list.append(sum)
+            end_list.append(sum + len(merged_ops_list[i]) - 1)
+            sum += len(merged_ops_list[i])
+            if i < l - 1:
+                send_list[i].extend(list(in_from_pre[i + 1]))
+            prog = program.clone()
+            if merged_type_list[i] != type_cpu:
+                prog = prog._prune_with_input(
+                    list(in_from_pre[i]), list(send_list[i]))
+                program_list.append(prog)
+            else:
+                program_list.append(prog)
+        recv_list = [list(i) for i in in_from_pre]
+        found = False
+        heter_index = None
+        for i in range(len(merged_type_list)):
+            t = merged_type_list[i]
+            if t != type_cpu:
+                if found:
+                    print("only one region of program can be heter")
+                found = True
+                heter_index = i
+        if heter_index is None:
+            print("warning: non heter program")
+            return None
+        else:
+            return [start_list[heter_index], end_list[heter_index], send_list[heter_index], \
+                    recv_list[heter_index], program_list[heter_index]]
diff --git a/python/paddle/fluid/incubate/fleet/utils/fs.py b/python/paddle/fluid/incubate/fleet/utils/fs.py
index 4782c2f8d90c84027f9fd1d11022425a9ed6c84b..0ba06ef934a525d3801e233c6e2f124fb0a6df52 100644
--- a/python/paddle/fluid/incubate/fleet/utils/fs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/fs.py
@@ -45,6 +45,10 @@ class FSTimeOut(Exception):
     pass
 
 
+class FSShellCmdAborted(ExecuteError):
+    pass
+
+
 class FS(object):
     @abc.abstractmethod
     def ls_dir(self, fs_path):
@@ -87,7 +91,7 @@ class FS(object):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def mv(self, fs_src_path, fs_dst_path):
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False):
         raise NotImplementedError
 
     @abc.abstractmethod
@@ -98,6 +102,10 @@ class FS(object):
     def list_dirs(self, fs_path):
         raise NotImplementedError
 
+    @abc.abstractmethod
+    def touch(self, fs_path, exist_ok=True):
+        raise NotImplementedError
+
 
 class LocalFS(FS):
     def ls_dir(self, fs_path):
@@ -138,13 +146,21 @@ class LocalFS(FS):
     def is_exist(self, fs_path):
         return os.path.exists(fs_path)
 
-    def touch(self, fs_path):
-        return Path(fs_path).touch()
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return Path(fs_path).touch(exist_ok=True)
 
-    def mv(self, src_path, dst_path):
+    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
         if not self.is_exist(src_path):
             raise FSFileNotExistsError
 
+        if overwrite and self.is_exist(dst_path):
+            self.delete(dst_path)
+
         if self.is_exist(dst_path):
             raise FSFileExistsError
 
diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
index 27f68076f27feb32f823977274d2b33bccf3eb1c..b136b3853ad8d9f6025db292a6414295ee12a5f2 100644
--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
@@ -26,8 +26,8 @@ import time
 import logging
 import six
 from . import fs
-from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut
-import paddle.fluid as fluid
+from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted
+from paddle.fluid import core
 import functools
 
 from pathlib import PurePosixPath, Path
@@ -36,21 +36,39 @@ import shutil
 __all__ = ["HDFSClient"]
 
 
-def _handle_errors(f):
-    def handler(*args, **kwargs):
-        start = time.time()
-        while True:
-            try:
-                return f(*args, **kwargs)
-            except ExecuteError as e:
-                o = args[0]
+def _handle_errors(max_time_out=None):
+    def decorator(f):
+        @functools.wraps(f)
+        def handler(*args, **kwargs):
+            o = args[0]
+            time_out = max_time_out
+            if time_out is None:
                 time_out = float(o._time_out) / 1000.0
-                inter = float(o._sleep_inter) / 1000.0
-                if time.time() - start >= time_out:
-                    raise FSTimeOut
-                time.sleep(inter)
+            else:
+                time_out /= 1000.0
+            inter = float(o._sleep_inter) / 1000.0
+
+            start = time.time()
+            last_print_time = start
+            while True:
+                try:
+                    return f(*args, **kwargs)
+                #important: only ExecuteError need to retry
+                except ExecuteError as e:
+                    if time.time() - start >= time_out:
+                        raise FSTimeOut("args:{} timeout:{}".format(
+                            args, time.time() - start))
+
+                    time.sleep(inter)
 
-    return functools.wraps(f)(handler)
+                if time.time() - last_print_time > 30:
+                    print("hadoop operator timeout:args:{} timeout:{}".format(
+                        args, time.time() - start))
+                    last_print_time = time.time()
+
+        return handler
+
+    return decorator
 
 
 class HDFSClient(FS):
@@ -72,6 +90,7 @@ class HDFSClient(FS):
         if configs:
             for k, v in six.iteritems(configs):
                 config_command = '-D%s=%s' % (k, v)
+                self.pre_commands.append(config_command)
 
         self._time_out = time_out
         self._sleep_inter = sleep_inter
@@ -80,17 +99,22 @@ class HDFSClient(FS):
             r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
 
     def _run_cmd(self, cmd, redirect_stderr=False):
-        ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
-        return int(ret), output.splitlines()
-
+        exe_cmd = "{} -{}".format(self._base_cmd, cmd)
+        ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr)
+        ret = int(ret)
+        if ret == 134:
+            raise FSShellCmdAborted(cmd)
+        return ret, output.splitlines()
+
+    @_handle_errors()
     def list_dirs(self, fs_path):
         if not self.is_exist(fs_path):
             return []
 
-        dirs, _ = self.ls_dir(fs_path)
+        dirs, files = self._ls_dir(fs_path)
         return dirs
 
-    @_handle_errors
+    @_handle_errors()
     def ls_dir(self, fs_path):
         """	
         list directory under fs_path, and only give the pure name, not include the fs_path	
@@ -98,11 +122,14 @@ class HDFSClient(FS):
         if not self.is_exist(fs_path):
             return [], []
 
-        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
+        return self._ls_dir(fs_path)
+
+    def _ls_dir(self, fs_path):
+        cmd = "ls {}".format(fs_path)
         ret, lines = self._run_cmd(cmd)
 
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
         dirs = []
         files = []
@@ -111,9 +138,6 @@ class HDFSClient(FS):
             if len(arr) != 8:
                 continue
 
-            if fs_path not in arr[7]:
-                continue
-
             p = PurePosixPath(arr[7])
             if arr[0][0] == 'd':
                 dirs.append(p.name)
@@ -130,18 +154,20 @@ class HDFSClient(FS):
 
         return None
 
-    @_handle_errors
+    @_handle_errors()
     def is_dir(self, fs_path):
         if not self.is_exist(fs_path):
             return False
 
-        cmd = "{} -test -d {}".format(
-            self._base_cmd, fs_path, redirect_stderr=True)
+        return self._is_dir(fs_path)
+
+    def _is_dir(self, fs_path):
+        cmd = "test -d {}".format(fs_path, redirect_stderr=True)
         ret, lines = self._run_cmd(cmd)
         if ret:
             # other error
-            if self._test_match(lines) != None:
-                raise ExecuteError
+            if self._test_match(lines):
+                raise ExecuteError(cmd)
 
             return False
 
@@ -151,94 +177,155 @@ class HDFSClient(FS):
         if not self.is_exist(fs_path):
             return False
 
-        return not self.is_dir(fs_path)
+        return not self._is_dir(fs_path)
 
-    @_handle_errors
+    @_handle_errors()
     def is_exist(self, fs_path):
-        cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
+        cmd = "ls {} ".format(fs_path)
         ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
             for l in out:
                 if "No such file or directory" in l:
                     return False
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
         return True
 
-    @_handle_errors
+    # can't retry
     def upload(self, local_path, fs_path):
         if self.is_exist(fs_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(fs_path))
 
         local = LocalFS()
         if not local.is_exist(local_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
+            raise FSFileNotExistsError("{} not exists".format(local_path))
+
+        return self._try_upload(local_path, fs_path)
+
+    @_handle_errors()
+    def _try_upload(self, local_path, fs_path):
+        cmd = "put {} {}".format(local_path, fs_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            self.delete(fs_path)
+            raise e
+
+    # can't retry
     def download(self, fs_path, local_path):
         if self.is_exist(local_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(local_path))
 
         if not self.is_exist(fs_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
+            raise FSFileNotExistsError("{} not exits".format(fs_path))
+
+        return self._try_download(fs_path, local_path)
+
+    @_handle_errors()
+    def _try_download(self, fs_path, local_path):
+        cmd = "get {} {}".format(fs_path, local_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            local_fs = LocalFS()
+            local_fs.delete(local_path)
+            raise e
+
+    @_handle_errors()
     def mkdirs(self, fs_path):
         if self.is_exist(fs_path):
             return
 
-        cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
-        ret, lines = self._run_cmd(cmd)
+        out_hdfs = False
+
+        cmd = "mkdir {} ".format(fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
-            raise ExecuteError
+            for l in out:
+                if "No such file or directory" in l:
+                    out_hdfs = True
+                    break
+            if not out_hdfs:
+                raise ExecuteError(cmd)
+
+        if out_hdfs and not self.is_exist(fs_path):
+            cmd = "mkdir -p {}".format(fs_path)
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        if overwrite and self.is_exist(fs_dst_path):
+            self.delete(fs_dst_path)
 
-    @_handle_errors
-    def mv(self, fs_src_path, fs_dst_path, test_exists=True):
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError
+                raise FSFileNotExistsError("{} is not exists".format(
+                    fs_src_path))
 
             if self.is_exist(fs_dst_path):
-                raise FSFileExistsError
+                raise FSFileExistsError("{} exists already".format(
+                    fs_src_path, fs_dst_path, fs_dst_path))
+
+        return self._try_mv(fs_src_path, fs_dst_path)
+
+    @_handle_errors()
+    def _try_mv(self, fs_src_path, fs_dst_path):
+        cmd = "mv {} {}".format(fs_src_path, fs_dst_path)
+        ret = 0
+        try:
+            ret, _ = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            if not self.is_exist(fs_src_path) and \
+                    self.is_exist(fs_dst_path):
+                return
+            raise e
 
-        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
-        ret, _ = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
     def _rmr(self, fs_path):
-        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
+        cmd = "rmr {}".format(fs_path)
         ret, _ = self._run_cmd(cmd)
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
-    @_handle_errors
     def _rm(self, fs_path):
-        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
+        cmd = "rm {}".format(fs_path)
         ret, _ = self._run_cmd(cmd)
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)
 
+    @_handle_errors()
     def delete(self, fs_path):
         if not self.is_exist(fs_path):
             return
 
-        is_dir = self.is_dir(fs_path)
+        is_dir = self._is_dir(fs_path)
         if is_dir:
             return self._rmr(fs_path)
 
         return self._rm(fs_path)
 
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return self._touchz(fs_path)
+
+    @_handle_errors()
+    def _touchz(self, fs_path):
+        cmd = "touchz {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
     def need_upload_download(self):
         return True
diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py
index 347927509e6d539555bd4d1b7a594febbc68f57b..15a3022f932f4a702bf7f94ed936468b6a06e94e 100644
--- a/python/paddle/fluid/input.py
+++ b/python/paddle/fluid/input.py
@@ -17,10 +17,12 @@ import warnings
 from .framework import Variable, in_dygraph_mode
 from .layer_helper import LayerHelper
 from .data_feeder import check_variable_and_dtype, check_dtype
+from ..utils import deprecated
 
 __all__ = ['one_hot', 'embedding']
 
 
+@deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot')
 def one_hot(input, depth, allow_out_of_range=False):
     """
     :alias_main: paddle.nn.functional.one_hot
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
index 201cc61e4d479dc11b169e02481ac4ff4780c2b8..ef469377acfbc0c2c521de61f8eacc0f7c9f0854 100644
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import paddle
 from .framework import Program, program_guard, unique_name, cuda_places, cpu_places
 from .param_attr import ParamAttr
 from .initializer import Constant
@@ -44,10 +45,23 @@ class SimpleLayer(Layer):
 
 
 def run_check():
-    ''' install check to verify if install is success
-
+    """To check whether install is successful
     This func should not be called only if you need to verify installation
-    '''
+
+    Examples:
+        .. code-block: python
+
+            import paddle.fluid as fluid
+            fluid.install_check.run_check()
+
+            # If installed successfully, output may be
+            # Running Verify Fluid Program ... 
+            # W0805 04:24:59.496919 35357 device_context.cc:268] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.2, Runtime API Version: 10.1
+            # W0805 04:24:59.505594 35357 device_context.cc:276] device: 0, cuDNN Version: 7.6.
+            # Your Paddle Fluid works well on SINGLE GPU or CPU.
+            # Your Paddle Fluid works well on MUTIPLE GPU or CPU.
+            # Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now
+    """
     print("Running Verify Fluid Program ... ")
 
     device_list = []
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index ffe8939cd7a39cd7835fd9d0ab74dd66d4f24981..6e5f7fd035acfeab975f63b0794829d57f9bb239 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -1669,9 +1669,6 @@ def _load_persistable_nodes(executor, dirname, graph):
 def save(program, model_path):
     """
     :api_attr: Static Graph
-	:alias_main: paddle.save
-	:alias: paddle.save,paddle.tensor.save,paddle.tensor.io.save
-	:old_api: paddle.fluid.save
 
     This function save parameters, optimizer information and network description to  model_path.
 
@@ -1733,9 +1730,6 @@ def save(program, model_path):
 def load(program, model_path, executor=None, var_list=None):
     """
     :api_attr: Static Graph
-	:alias_main: paddle.load
-	:alias: paddle.load,paddle.tensor.load,paddle.tensor.io.load
-	:old_api: paddle.fluid.io.load
 
     This function get parameters and optimizer information from program, and then get corresponding value from file.
     An exception will throw if shape or dtype of the parameters is not match.
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 0b57b3fefd414c483c537957ed6ca3cfdd58fa65..6e38c855562809fa38cddbf6e58eb4eee6b899f3 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -23,8 +23,13 @@ from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from .initializer import _global_weight_initializer, _global_bias_initializer
 
+__all__ = ['LayerHelperBase']
+
 
 class LayerHelperBase(object):
+    # global dtype
+    __dtype = "float32"
+
     def __init__(self, name, layer_type):
         self._layer_type = layer_type
         self._name = name
@@ -45,6 +50,14 @@ class LayerHelperBase(object):
     def startup_program(self):
         return default_startup_program()
 
+    @classmethod
+    def set_default_dtype(cls, dtype):
+        cls.__dtype = dtype
+
+    @classmethod
+    def get_default_dtype(cls):
+        return cls.__dtype
+
     def to_variable(self, value, name=None):
         """
         The API will create a ``Variable`` object from numpy\.ndarray or Variable object.
@@ -277,7 +290,7 @@ class LayerHelperBase(object):
     def create_parameter(self,
                          attr,
                          shape,
-                         dtype,
+                         dtype=None,
                          is_bias=False,
                          default_initializer=None,
                          stop_gradient=False,
@@ -299,6 +312,9 @@ class LayerHelperBase(object):
         if not attr:
             return None
         assert isinstance(attr, ParamAttr)
+        # set global dtype
+        if not dtype:
+            dtype = self.__dtype
         if is_bias:
             suffix = 'b'
             default_initializer = _global_bias_initializer(
@@ -372,6 +388,9 @@ class LayerHelperBase(object):
             based on operator's `VarTypeInference` implementation in
             infer_var_type.
         """
+        # set global dtype
+        if not dtype:
+            dtype = self.__dtype
         return self.main_program.current_block().create_var(
             name=unique_name.generate_with_ignorable_key(".".join(
                 [self.name, 'tmp'])),
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index b179d00626249849f64f0fc571cb2e85cf08ea05..2002b8a95decfd6d6c55538e2dff0a793828dd9b 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1511,7 +1511,7 @@ def array_write(x, i, array=None):
         assert i.shape == [
             1
         ], "The shape of index 'i' should be [1] in dygraph mode"
-        i = i.numpy()[0]
+        i = i.numpy().item(0)
         if array is None:
             array = create_array(x.dtype)
         assert isinstance(
@@ -1976,7 +1976,7 @@ def array_read(array, i):
         assert i.shape == [
             1
         ], "The shape of index 'i' should be [1] in dygraph mode"
-        i = i.numpy()[0]
+        i = i.numpy().item(0)
         return array[i]
 
     check_variable_and_dtype(i, 'i', ['int64'], 'array_read')
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 7906f563c0009ac37695f50c9dc2b035b8f004aa..ea6abe2d335e6669b27ba278c0faaca62ca0fdbb 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -471,9 +471,9 @@ def rpn_target_assign(bbox_pred,
 
 def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25):
     """
-	:alias_main: paddle.nn.functional.sigmoid_focal_loss
-	:alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss
-	:old_api: paddle.fluid.layers.sigmoid_focal_loss
+	:alias_main: paddle.nn.functional.sigmoid_focal_loss
+	:alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss
+	:old_api: paddle.fluid.layers.sigmoid_focal_loss
 
     **Sigmoid Focal Loss Operator.**
 
@@ -628,9 +628,9 @@ def detection_output(loc,
                      nms_eta=1.0,
                      return_index=False):
     """
-	:alias_main: paddle.nn.functional.detection_output
-	:alias: paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output
-	:old_api: paddle.fluid.layers.detection_output
+	:alias_main: paddle.nn.functional.detection_output
+	:alias: paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output
+	:old_api: paddle.fluid.layers.detection_output
 
     Given the regression locations, classification confidences and prior boxes,
     calculate the detection outputs by performing following steps:
@@ -761,9 +761,9 @@ def detection_output(loc,
 @templatedoc()
 def iou_similarity(x, y, box_normalized=True, name=None):
     """
-	:alias_main: paddle.nn.functional.iou_similarity
-	:alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity
-	:old_api: paddle.fluid.layers.iou_similarity
+	:alias_main: paddle.nn.functional.iou_similarity
+	:alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity
+	:old_api: paddle.fluid.layers.iou_similarity
 
     ${comment}
 
@@ -821,9 +821,9 @@ def box_coder(prior_box,
               name=None,
               axis=0):
     """
-	:alias_main: paddle.nn.functional.box_coder
-	:alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder
-	:old_api: paddle.fluid.layers.box_coder
+	:alias_main: paddle.nn.functional.box_coder
+	:alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder
+	:old_api: paddle.fluid.layers.box_coder
 
     **Box Coder Layer**
 
@@ -1012,9 +1012,9 @@ def yolov3_loss(x,
                 name=None,
                 scale_x_y=1.):
     """
-	:alias_main: paddle.nn.functional.yolov3_loss
-	:alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss
-	:old_api: paddle.fluid.layers.yolov3_loss
+	:alias_main: paddle.nn.functional.yolov3_loss
+	:alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss
+	:old_api: paddle.fluid.layers.yolov3_loss
 
     ${comment}
 
@@ -1139,9 +1139,9 @@ def yolo_box(x,
              name=None,
              scale_x_y=1.):
     """
-	:alias_main: paddle.nn.functional.yolo_box
-	:alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box
-	:old_api: paddle.fluid.layers.yolo_box
+	:alias_main: paddle.nn.functional.yolo_box
+	:alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box
+	:old_api: paddle.fluid.layers.yolo_box
 
     ${comment}
 
@@ -1318,9 +1318,9 @@ def bipartite_match(dist_matrix,
                     dist_threshold=None,
                     name=None):
     """
-	:alias_main: paddle.nn.functional.bipartite_match
-	:alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match
-	:old_api: paddle.fluid.layers.bipartite_match
+	:alias_main: paddle.nn.functional.bipartite_match
+	:alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match
+	:old_api: paddle.fluid.layers.bipartite_match
 
     This operator implements a greedy bipartite matching algorithm, which is
     used to obtain the matching with the maximum distance based on the input
@@ -1412,9 +1412,9 @@ def target_assign(input,
                   mismatch_value=None,
                   name=None):
     """
-	:alias_main: paddle.nn.functional.target_assign
-	:alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign
-	:old_api: paddle.fluid.layers.target_assign
+	:alias_main: paddle.nn.functional.target_assign
+	:alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign
+	:old_api: paddle.fluid.layers.target_assign
 
     This operator can be, for given the target bounding boxes or labels,
     to assign classification and regression targets to each prediction as well as
@@ -1530,9 +1530,9 @@ def ssd_loss(location,
              normalize=True,
              sample_size=None):
     """
-	:alias_main: paddle.nn.functional.ssd_loss
-	:alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss
-	:old_api: paddle.fluid.layers.ssd_loss
+	:alias_main: paddle.nn.functional.ssd_loss
+	:alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss
+	:old_api: paddle.fluid.layers.ssd_loss
 
     **Multi-box loss layer for object detection algorithm of SSD**
 
@@ -1777,9 +1777,9 @@ def prior_box(input,
               name=None,
               min_max_aspect_ratios_order=False):
     """
-	:alias_main: paddle.nn.functional.prior_box
-	:alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box
-	:old_api: paddle.fluid.layers.prior_box
+	:alias_main: paddle.nn.functional.prior_box
+	:alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box
+	:old_api: paddle.fluid.layers.prior_box
 
     This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
     Each position of the input produce N prior boxes, N is determined by
@@ -1938,9 +1938,9 @@ def density_prior_box(input,
                       flatten_to_2d=False,
                       name=None):
     """
-	:alias_main: paddle.nn.functional.density_prior_box
-	:alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box
-	:old_api: paddle.fluid.layers.density_prior_box
+	:alias_main: paddle.nn.functional.density_prior_box
+	:alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box
+	:old_api: paddle.fluid.layers.density_prior_box
 
 
     This op generates density prior boxes for SSD(Single Shot MultiBox Detector) 
@@ -2130,7 +2130,7 @@ def multi_box_head(inputs,
                    name=None,
                    min_max_aspect_ratios_order=False):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     Base on SSD ((Single Shot MultiBox Detector) algorithm, generate prior boxes,
     regression location and classification confidence on multiple input feature
@@ -2407,9 +2407,9 @@ def anchor_generator(input,
                      offset=0.5,
                      name=None):
     """
-	:alias_main: paddle.nn.functional.anchor_generator
-	:alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator
-	:old_api: paddle.fluid.layers.anchor_generator
+	:alias_main: paddle.nn.functional.anchor_generator
+	:alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator
+	:old_api: paddle.fluid.layers.anchor_generator
 
     **Anchor generator operator**
 
@@ -2612,9 +2612,9 @@ def generate_proposal_labels(rpn_rois,
                              is_cls_agnostic=False,
                              is_cascade_rcnn=False):
     """
-	:alias_main: paddle.nn.functional.generate_proposal_labels
-	:alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels
-	:old_api: paddle.fluid.layers.generate_proposal_labels
+	:alias_main: paddle.nn.functional.generate_proposal_labels
+	:alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels
+	:old_api: paddle.fluid.layers.generate_proposal_labels
 
     **Generate Proposal Labels of Faster-RCNN**
 
@@ -2737,9 +2737,9 @@ def generate_proposal_labels(rpn_rois,
 def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
                          labels_int32, num_classes, resolution):
     """
-	:alias_main: paddle.nn.functional.generate_mask_labels
-	:alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels
-	:old_api: paddle.fluid.layers.generate_mask_labels
+	:alias_main: paddle.nn.functional.generate_mask_labels
+	:alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels
+	:old_api: paddle.fluid.layers.generate_mask_labels
 
     **Generate Mask Labels for Mask-RCNN**
 
@@ -2896,9 +2896,9 @@ def generate_proposals(scores,
                        name=None,
                        return_rois_num=False):
     """
-	:alias_main: paddle.nn.functional.generate_proposals
-	:alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals
-	:old_api: paddle.fluid.layers.generate_proposals
+	:alias_main: paddle.nn.functional.generate_proposals
+	:alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals
+	:old_api: paddle.fluid.layers.generate_proposals
 
     **Generate proposal Faster-RCNN**
 
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 5c14d26f3fe242c87676c2d2c27aa12f555e5638..7aedb2ca2566c6da196b655d1aa24ed99150a709 100755
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -25,8 +25,7 @@ from ..layer_helper import LayerHelper
 from ..data_feeder import check_variable_and_dtype
 
 __all__ = [
-    'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
-    'templatedoc'
+    'generate_layer_fn', 'generate_activation_fn', 'autodoc', 'templatedoc'
 ]
 
 
@@ -82,8 +81,9 @@ def _generate_doc_string_(op_proto,
     buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
-        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        line_begin = '    {0}'.format(_convert_(each_input.name))
         buf.write(line_begin)
+        buf.write(" (Tensor): ")
         buf.write(escape_math(each_input.comment))
         if each_input.duplicable:
             buf.write("  Duplicatable.")
@@ -125,6 +125,8 @@ def _generate_doc_string_(op_proto,
         for each_opt in op_proto.outputs:
             if not each_opt.intermediate:
                 break
+        buf.write(_convert_(each_opt.name))
+        buf.write(' (Tensor): ')
         buf.write(escape_math(each_opt.comment))
 
     return buf.getvalue()
@@ -275,50 +277,11 @@ def generate_activation_fn(op_type):
     func.__doc__ = _generate_doc_string_(
         op_proto,
         additional_args_lines=[
-            "name(str, optional): The default value is None.  Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` ."
+            "name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`."
         ])
-    func.__doc__ = func.__doc__ + """
-
-Return type
-  Variable
-
-Examples:
-    .. code-block:: python
-
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-        x_data = np.array([1, 2, 3, 4]).astype(np.float32)
-        x = paddle.imperative.to_variable(x_data)
-        res = paddle.%s(x)
-        print(res.numpy())
-""" % op_type
     return func
 
 
-def deprecated(func_or_class):
-    """
-    Deprecated warning decorator. It will result a warning message.
-    Should be used before class or function, member function
-    """
-
-    @functools.wraps(func)
-    def func_wrapper(*args, **kwargs):
-        """
-        Wrap func with deprecated warning
-        """
-        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
-        warnings.warn(
-            "Call to deprecated function {}.".format(func.__name__),
-            category=DeprecationWarning,
-            stacklevel=2)
-        warnings.simplefilter('default', DeprecationWarning)  # reset filter
-        return func(*args, **kwargs)
-
-    return func_wrapper
-
-
 def autodoc(comment=""):
     def __impl__(func):
         func.__doc__ = _generate_doc_string_(OpProtoHolder.instance(
@@ -384,3 +347,14 @@ def templatedoc(op_type=None):
         return func
 
     return __impl__
+
+
+def add_sample_code(func, sample_code):
+    """
+    Append sample code for dynamically generated functions. 
+
+    Args:
+       func: The function of the function to be append sample code to.
+       sample_code: sample code session in rst format.
+    """
+    func.__doc__ = func.__doc__ + sample_code
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 4217a98798ebbb46cb5b84e4c15fea4b4f0840ac..f468815c99ea2751913c5535c721ee9a6a5c5052 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import numpy as np
 from functools import partial, reduce
+from paddle.utils import deprecated
 from . import nn
 from .layer_function_generator import templatedoc
 from ..layer_helper import LayerHelper
@@ -1619,6 +1620,7 @@ def huber_loss(input, label, delta):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.kl_div")
 @templatedoc()
 def kldiv_loss(x, target, reduction='mean', name=None):
     """
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index fd1e7f800b928cfcecb9e09877f08c42c81defa6..38fc34472c8bc64338e2468bdf3f4b0bab1370ce 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import warnings
 import inspect
+import paddle
 
 from .. import core
 from ..framework import Variable, unique_name
@@ -45,6 +46,7 @@ EXPRESSION_MAP = {
     "__pow__": "A ** B",
     "__rpow__": "A **= B",
     "__floordiv__": "A //B",
+    "__rfloordiv__": "A //= B",
     "__mod__": "A % B",
     "__eq__": "A == B",
     "__ne__": "A != B",
@@ -54,6 +56,31 @@ EXPRESSION_MAP = {
     "__ge__": "A >= B"
 }
 
+# method for Tensor from paddle.tensor
+# edit it when paddle.tensor has new method about Tensor operation
+common_methods = [
+    'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos',
+    'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square',
+    'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross',
+    'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than',
+    'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and',
+    'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all',
+    'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as',
+    'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter',
+    'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze',
+    'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip',
+    'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal',
+    'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max',
+    'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf',
+    'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort',
+    'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort',
+    'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div',
+    'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow',
+    'elementwise_sub'
+]
+
+_already_patch_variable = False
+
 
 def monkey_patch_variable():
     def unique_tmp_name():
@@ -179,7 +206,7 @@ def monkey_patch_variable():
                    "out_dtype": out.dtype})
         return out
 
-    def _scalar_elementwise_op_(var, scale, bias):
+    def _scalar_op_(var, scale, bias):
         block = current_block(var)
         out = create_new_tmp_var(block, var.dtype)
         block.append_op(
@@ -191,27 +218,46 @@ def monkey_patch_variable():
         return out
 
     def _neg_(var):
-        return _scalar_elementwise_op_(var, -1.0, 0.0)
+        return _scalar_op_(var, -1.0, 0.0)
+
+    def _scalar_add_(var, value):
+        return _scalar_op_(var, 1.0, value)
 
-    def _scalar_elementwise_add_(var, value):
-        return _scalar_elementwise_op_(var, 1.0, value)
+    def _scalar_sub_(var, value):
+        return _scalar_op_(var, 1.0, -value)
 
-    def _scalar_elementwise_sub_(var, value):
-        return _scalar_elementwise_op_(var, 1.0, -value)
+    def _scalar_rsub_(var, value):
+        return _scalar_op_(var, -1.0, value)
 
-    def _scalar_elementwise_rsub_(var, value):
-        return _scalar_elementwise_op_(var, -1.0, value)
+    def _scalar_mul_(var, value):
+        return _scalar_op_(var, value, 0.0)
 
-    def _scalar_elementwise_mul_(var, value):
-        return _scalar_elementwise_op_(var, value, 0.0)
+    def _scalar_div_(var, value):
+        return _scalar_op_(var, 1.0 / value, 0.0)
 
-    def _scalar_elementwise_div_(var, value):
-        return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
+    # TODO(shenliang03):  currently, it supports divide, floor_divide, remainder
+    # for binary operator by using the api to achieve the type promotion
+    def _binary_method_creator_(op_type, reverse=False):
+        import paddle
+
+        def __impl__(self, other_var):
+            op = getattr(paddle, op_type)
+            if reverse:
+                return op(other_var, self)
+            else:
+                return op(self, other_var)
+
+        __impl__.__doc__ = """
+
+        See paddle.{}""".format(op_type)
+        __impl__.__name__ = op_type
+
+        return __impl__
 
-    def _elemwise_method_creator_(method_name,
-                                  op_type,
-                                  reverse=False,
-                                  scalar_method=None):
+    def _binary_creator_(method_name,
+                         op_type,
+                         reverse=False,
+                         scalar_method=None):
         def __impl__(self, other_var):
             # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
             # which may lose accuracy. This is a hot fix for release 1.6.
@@ -296,35 +342,56 @@ def monkey_patch_variable():
         __impl__.__name__ = method_name
         return __impl__
 
-    # inject methods
-    for method_name, op_type, reverse, scalar_method in (
-        ("__add__", "elementwise_add", False, _scalar_elementwise_add_),
-            # a+b == b+a. Do not need to reverse explicitly
-        ("__radd__", "elementwise_add", False, _scalar_elementwise_add_),
-        ("__sub__", "elementwise_sub", False, _scalar_elementwise_sub_),
-        ("__rsub__", "elementwise_sub", True, _scalar_elementwise_rsub_),
-        ("__mul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-            # a*b == b*a. Do not need to reverse explicitly
-        ("__rmul__", "elementwise_mul", False, _scalar_elementwise_mul_),
-        ("__div__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__truediv__", "elementwise_div", False, _scalar_elementwise_div_),
-        ("__rdiv__", "elementwise_div", True, None),
-        ("__rtruediv__", "elementwise_div", True, None),
-        ("__pow__", "elementwise_pow", False, None),
-        ("__rpow__", "elementwise_pow", True, None),
-        ("__floordiv__", "elementwise_floordiv", False, None),
-        ("__mod__", "elementwise_mod", False, None),
-            # for logical compare
-        ("__eq__", "equal", False, None),
-        ("__ne__", "not_equal", False, None),
-        ("__lt__", "less_than", False, None),
-        ("__le__", "less_equal", False, None),
-        ("__gt__", "greater_than", False, None),
-        ("__ge__", "greater_equal", False, None)):
-        setattr(Variable, method_name,
-                _elemwise_method_creator_(method_name, op_type, reverse,
-                                          scalar_method))
-
-    # b = -a
-    Variable.__neg__ = _neg_
-    Variable.astype = astype
+    variable_methods = [
+        #   b=-a
+        ('__neg__', _neg_),
+        ('astype', astype),
+        ('__add__', _binary_creator_('__add__', 'elementwise_add', False,
+                                     _scalar_add_)),
+        #  a+b == b+a. Do not need to reverse explicitly
+        ('__radd__',
+         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
+        ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False,
+                                     _scalar_sub_)),
+        ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True,
+                                      _scalar_rsub_)),
+        ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False,
+                                     _scalar_mul_)),
+        #  a*b == b*a. Do not need to reverse explicitly
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
+                                     None)),
+        ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
+                                      None)),
+        # These binary use paddle.optype
+        ('__div__', _binary_method_creator_('divide', False)),
+        ('__rdiv__', _binary_method_creator_('divide', True)),
+        ('__truediv__', _binary_method_creator_('divide', False)),
+        ('__rtruediv__', _binary_method_creator_('divide', True)),
+        ('__floordiv__', _binary_method_creator_('floor_divide', False)),
+        ('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
+        ('__mod__', _binary_method_creator_('remainder', False)),
+        #  for logical compare
+        ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
+        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
+        ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None))
+    ]
+
+    global _already_patch_variable
+    if not _already_patch_variable:
+        for method in variable_methods:
+            method_name = method[0]
+            method_impl = method[1]
+            setattr(Variable, method_name, method_impl)
+    else:
+        import paddle.tensor
+        for method_name in common_methods:
+            if hasattr(Variable, method_name): continue
+            method_impl = getattr(paddle.tensor, method_name, None)
+            if method_impl: setattr(Variable, method_name, method_impl)
+
+    _already_patch_variable = True
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
old mode 100644
new mode 100755
index a46391452bb2433cd2fd7e261fb1f5b87a735316..da66503ceee37e30fafa0d5402edd2a188578a0b
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -26,7 +26,7 @@ import six
 import paddle
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program
+from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator
 from .. import dygraph_utils
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
@@ -35,6 +35,7 @@ from . import utils
 from .. import unique_name
 from functools import reduce
 from .. import core
+from ...utils import deprecated
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
 import paddle
 from paddle.utils import deprecated
@@ -931,6 +932,7 @@ def cos_sim(X, Y):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.dropout")
 def dropout(x,
             dropout_prob,
             is_test=False,
@@ -938,9 +940,6 @@ def dropout(x,
             name=None,
             dropout_implementation="downgrade_in_infer"):
     """
-    :alias_main: paddle.nn.functional.dropout
-	:alias: paddle.nn.functional.dropout,paddle.nn.functional.common.dropout
-	:old_api: paddle.fluid.layers.dropout
 
     Computes dropout.
 
@@ -1188,12 +1187,9 @@ def chunk_eval(input,
             num_correct_chunks)
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax")
 def softmax(input, use_cudnn=False, name=None, axis=-1):
     """
-    :alias_main: paddle.nn.functional.softmax
-	:alias: paddle.nn.functional.softmax,paddle.nn.functional.activation.softmax
-	:old_api: paddle.fluid.layers.softmax
-
     This operator implements the softmax layer. The calculation process is as follows:
 
     1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
@@ -1307,8 +1303,8 @@ def softmax(input, use_cudnn=False, name=None, axis=-1):
     attrs = {"axis": axis, "use_cudnn": use_cudnn}
 
     helper = LayerHelper('softmax', **locals())
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'softmax')
+    check_variable_and_dtype(input, 'input/x',
+                             ['float16', 'float32', 'float64'], 'softmax')
 
     dtype = helper.input_dtype()
     softmax_out = helper.create_variable_for_type_inference(dtype)
@@ -3366,6 +3362,15 @@ def data_norm(input,
         "BatchSum": batch_sum,
         "BatchSquareSum": batch_square_sum
     }
+    attrs = {
+        "epsilon": epsilon,
+        "sync_stats": sync_stats,
+        "summary_decay_rate": summary_decay_rate,
+    }
+    if slot_dim > 0:
+        attrs["slot_dim"] = slot_dim
+    if enable_scale_and_shift:
+        attrs["enable_scale_and_shift"] = enable_scale_and_shift
     if enable_scale_and_shift:
         inputs["scale_w"] = scale_w
         inputs["bias"] = bias
@@ -3380,13 +3385,7 @@ def data_norm(input,
             "BatchSum": batch_sum,
             "BatchSquareSum": batch_square_sum
         },
-        attrs={
-            "epsilon": epsilon,
-            "slot_dim": slot_dim,
-            "sync_stats": sync_stats,
-            "summary_decay_rate": summary_decay_rate,
-            "enable_scale_and_shift": enable_scale_and_shift
-        })
+        attrs=attrs)
 
     return helper.append_activation(data_norm_out)
 
@@ -4401,12 +4400,9 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.mean")
 def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
-    :alias_main: paddle.reduce_mean
-	:alias: paddle.reduce_mean,paddle.tensor.reduce_mean,paddle.tensor.stat.reduce_mean
-	:old_api: paddle.fluid.layers.reduce_mean
-
     Computes the mean of the input tensor's elements along the given dimension.
 
     Args:
@@ -4455,31 +4451,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0]
     """
 
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-
-    if in_dygraph_mode():
-        reduce_all = True if dim == None or dim == [] or len(dim) == len(
-            input.shape) else False
-        dim = dim if dim != None and dim != [] else [0]
-        return core.ops.reduce_mean(input, 'dim', dim, 'keep_dim', keep_dim,
-                                    'reduce_all', reduce_all)
-    attrs = {
-        'dim': dim if dim != None and dim != [] else [0],
-        'keep_dim': keep_dim,
-        'reduce_all': True
-        if dim == None or dim == [] or len(dim) == len(input.shape) else False
-    }
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_mean')
-    helper = LayerHelper('reduce_mean', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='reduce_mean',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs=attrs)
-    return out
+    return paddle.mean(x=input, axis=dim, keepdim=keep_dim, name=name)
 
 
 def reduce_max(input, dim=None, keep_dim=False, name=None):
@@ -4625,7 +4597,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
     Args:
         input (Variable): The input variable which is a Tensor, the data type is float32,
             float64, int32, int64.
-        dim (list|int, optional): The dimensions along which the product is performed. If
+        dim (int|list|tuple, optional): The dimensions along which the product is performed. If
             :attr:`None`, multiply all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
             range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
@@ -4665,9 +4637,18 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0]
     """
     helper = LayerHelper('reduce_prod', **locals())
-    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
-        dim = [dim]
+        if isinstance(dim, tuple):
+            dim = list(dim)
+        elif isinstance(dim, int):
+            dim = [dim]
+        else:
+            raise TypeError(
+                "The type of axis must be int, list or tuple, but received {}".
+                format(type(dim)))
+    check_variable_and_dtype(
+        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod')
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='reduce_prod',
         inputs={'X': input},
@@ -4872,7 +4853,7 @@ def split(input, num_or_sections, dim=-1, name=None):
 
         if isinstance(dim, Variable):
             dim = dim.numpy()
-            dim = dim[0]
+            dim = dim.item(0)
         dim = (len(input.shape) + dim) if dim < 0 else dim
         attrs += ('axis', dim)
 
@@ -4897,7 +4878,7 @@ def split(input, num_or_sections, dim=-1, name=None):
 
     check_variable_and_dtype(
         input, 'input',
-        ['bool', 'float16', 'float32', 'float64', 'int32', 'in64'], 'split')
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'split')
     check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split')
     check_type(dim, 'dim', (int, Variable), 'split')
     if isinstance(dim, Variable):
@@ -5055,6 +5036,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.matmul")
 def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     """
     Applies matrix multiplication to two tensors.
@@ -5126,7 +5108,65 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
             y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32')
             out = fluid.layers.matmul(x, y, True, True)
     """
-    return paddle.matmul(x, y, transpose_x, transpose_y, alpha, name)
+    attrs = {
+        'transpose_X': transpose_x,
+        'transpose_Y': transpose_y,
+        'alpha': float(alpha),
+    }
+
+    if in_dygraph_mode():
+        out = _varbase_creator(dtype=x.dtype)
+        core.ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y',
+                        transpose_y, 'alpha', float(alpha))
+        return out
+
+    def __check_input(x, y):
+        var_names = {'x': x, 'y': y}
+        for name, val in var_names.items():
+            check_variable_and_dtype(
+                val, name, ['float16', 'float32', 'float64'], 'matmul')
+        x_shape = list(x.shape)
+        y_shape = list(y.shape)
+        if len(x_shape) == 1:
+            x_shape = [1] + x_shape
+        if len(y_shape) == 1:
+            y_shape = y_shape + [1]
+
+        # check the inner 2 dimensions
+        if transpose_x:
+            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
+        if transpose_y:
+            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
+        if x_shape[-1] != y_shape[-2]:
+            assert (x_shape[-1] == -1) or (y_shape[-2] == -1),                         \
+                "After performing an optional transpose, Input X's width should be "   \
+                "equal to Y's width for multiplication "                               \
+                "prerequisites. But received X's shape: %s, Y's shape: %s\n" %         \
+                (x_shape, y_shape)
+
+        if len(y_shape) > 2 and len(x_shape) > 2:
+            for i, dim_x in enumerate(x_shape[:-2]):
+                # don't check neg shape
+                if dim_x < 0 or y_shape[i] < 0:
+                    continue
+                if dim_x != y_shape[i]:
+                    raise ValueError(
+                        "When the matrix is larger than 2 dimensions, the higher "
+                        "dimensional values of the two matrices need to be equal. "
+                        "But received x_shape[%d] != y_shape[%d]. X's shape: %s, "
+                        "Y's shape: %s.\n" % (i, i, x_shape, y_shape))
+
+    __check_input(x, y)
+
+    helper = LayerHelper('matmul', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs=attrs)
+    return out
 
 
 def topk(input, k, name=None):
@@ -5831,6 +5871,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     return loss
 
 
+@deprecated(since='2.0.0', update_to='paddle.nn.functional.one_hot')
 def one_hot(input, depth, allow_out_of_range=False):
     """
 
@@ -5916,7 +5957,7 @@ def one_hot(input, depth, allow_out_of_range=False):
             depth = depth.numpy()
             assert depth.shape == (
                 1, ), "depth of type Variable should have shape [1]"
-            depth = depth[0]
+            depth = depth.item(0)
         out = core.ops.one_hot(input, 'depth', depth, 'allow_out_of_range',
                                allow_out_of_range)
         out.stop_gradient = True
@@ -5994,7 +6035,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
     """
     :alias_main: paddle.reshape
 	:alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape
-	:old_api: paddle.fluid.layers.reshape
 
     This operator changes the shape of ``x`` without changing its data.
 
@@ -6037,14 +6077,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
         The parameter ``actual_shape`` will be deprecated in the future and only use ``shape`` instead to represent the target shape.
 
     Args:
-        x(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
-        shape(list|tuple|Variable): Define the target shape. At most one dimension of the target shape can be -1.
+        x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
+        shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1.
                         The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
-                        If ``shape`` is an Variable, it should be an 1-D Tensor .
+                        If ``shape`` is an Tensor, it should be an 1-D Tensor .
         actual_shape(variable, optional): An 1-D ``Tensor`` or ``LoDTensor`` . The data type is ``int32`` . If provided, reshape
                                 according to this given shape rather than ``shape`` specifying shape.
                                 That is to say ``actual_shape`` has a higher priority
-                                than ``shape(list|tuple)`` but not ``shape(Variable)``. \
+                                than ``shape(list|tuple)`` but not ``shape(Tensor)``. \
                                 This argument ``actual_shape`` will be removed in a future version. \
                                 Instructions for updating: ``actual_shape`` will be removed in future versions and replaced by ``shape``.
         act (str, optional): The non-linear activation to be applied to the reshaped input. Default None.
@@ -6056,10 +6096,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
                             For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable.
+        Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable.
 
     Raises:
-        TypeError: If actual_shape is neither Variable nor None.
+        TypeError: If actual_shape is neither Tensor nor None.
         ValueError: If more than one elements of ``shape`` is -1.
         ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``.
         ValueError: If the elements in ``shape`` is negative except -1.
@@ -6070,7 +6110,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
             import paddle.fluid as fluid
 
             # example 1:
-            # attr shape is a list which doesn't contain tensor Variable.
+            # attr shape is a list which doesn't contain Tensors.
             data_1 = fluid.data(
               name='data_1', shape=[2, 4, 6], dtype='float32')
             reshaped_1 = fluid.layers.reshape(
@@ -6078,7 +6118,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
             # the shape of reshaped_1 is [2,4,3,2].
 
             # example 2:
-            # attr shape is a list which contains tensor Variable.
+            # attr shape is a list which contains Tensors.
             data_2 = fluid.layers.fill_constant([2,25], "int32", 3)
             dim = fluid.layers.fill_constant([1], "int32", 5)
             reshaped_2 = fluid.layers.reshape(data_2, shape=[dim, 10])
@@ -6098,7 +6138,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
             )
         if isinstance(shape, (list, tuple)):
             shape = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in shape
             ]
             out, _ = core.ops.reshape2(x, 'shape', shape)
@@ -6200,7 +6240,7 @@ def squeeze(input, axes, name=None):
             Out.shape = [1,3,5]
 
     Args:
-        input (Variable): The input Tensor. Support data type: float16, float32, float64, int8, int32, int64.
+        input (Variable): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
                           axes (list): One integer or List of integers, indicating the dimensions to be squeezed.
                           Axes range is :math:`[-rank(input), rank(input))`.
                           If axes is negative, :math:`axes=axes+rank(input)`.
@@ -6226,8 +6266,9 @@ def squeeze(input, axes, name=None):
     helper = LayerHelper("squeeze", **locals())
     check_variable_and_dtype(
         input, 'input',
-        ['float16', 'float32', 'float64', 'int8', 'int32', 'int64'], 'squeeze')
-    check_type(axes, 'axes', (list, tuple), 'squeeze')
+        ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
+        'squeeze')
+    check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
@@ -6254,12 +6295,12 @@ def unsqueeze(input, axes, name=None):
       then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
 
     Args:
-        input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32.
+        input (Variable): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
         axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
         name (str|None): Name for this layer.
 
     Returns:
-        Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64.
+        Variable: Unsqueezed Tensor, with the same data type as input.
 
     Examples:
         .. code-block:: python
@@ -6269,10 +6310,15 @@ def unsqueeze(input, axes, name=None):
             y = fluid.layers.unsqueeze(input=x, axes=[1])
 
     """
-    if not isinstance(axes, (int, list, tuple, Variable)):
-        raise TypeError(
-            "The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but "
-            "received %s." % (type(axes)))
+    if in_dygraph_mode():
+        out, _ = core.ops.unsqueeze2(input, 'axes', axes)
+        return out
+
+    check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
+    check_variable_and_dtype(
+        input, 'input',
+        ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
+        'unsqueeze')
     helper = LayerHelper("unsqueeze2", **locals())
     inputs = {"X": input}
     attrs = {}
@@ -8164,9 +8210,9 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
     return image_resize(input=input, out_shape=out_shape, resample=resample)
 
 
+@deprecated(since="2.0.0", update_to="paddle.gather")
 def gather(input, index, overwrite=True):
     """
-    **Gather Layer**
 
     Output is obtained by gathering entries of the outer-most dimension
     of X indexed by `index` and concatenate them together.
@@ -8193,19 +8239,21 @@ def gather(input, index, overwrite=True):
                        [5, 6]]
 
     Args:
-        input (Variable): The source input tensor with rank>=1. Supported data type is
+        input (Tensor): The source input tensor with rank>=1. Supported data type is
             int32, int64, float32, float64 and uint8 (only for CPU),
             float16 (only for GPU).
-        index (Variable): The index input tensor with rank=1. Data type is int32 or int64.
+        index (Tensor): The index input tensor with rank=1. Data type is int32 or int64.
         overwrite (bool, optional): The mode that updating the grad when has same index.
             If True, use the overwrite mode to update the grad of the same index,
 	    if False, use the accumulate mode to update the grad of the same index.
 	    Default value is True.
 
-
-
     Returns:
-        output (Variable): The output is a tensor with the same rank as input.
+        output (Tensor): The output is a tensor with the same rank as input.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64.
 
     Examples:
 
@@ -8216,6 +8264,13 @@ def gather(input, index, overwrite=True):
             index = fluid.data(name='index', shape=[-1, 1], dtype='int32')
             output = fluid.layers.gather(x, index)
     """
+    if in_dygraph_mode():
+        return core.ops.gather(input, index, None)
+
+    check_variable_and_dtype(
+        input, 'x',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], 'gather')
+    check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
@@ -8228,6 +8283,7 @@ def gather(input, index, overwrite=True):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.gather_nd")
 def gather_nd(input, index, name=None):
     """
     **Gather Nd Layer**
@@ -8280,14 +8336,18 @@ def gather_nd(input, index, name=None):
                          = [23]
 
     Args:
-        input (Variable): The source input. Its dtype should be int32, int64, float32, float64.
-        index (Variable): The index input with rank > 1, index.shape[-1] <= input.rank.
-                          Its dtype should be int32, int64.
-        name (str|None): A name for this layer(optional). If set None, the
-                         layer will be named automatically.
+        input (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64.
+        index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank.
+                        Its dtype should be int32, int64.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
+                        For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        output (Variable): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
+        output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
+    
+    Raises:
+        TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of float32, float64, int32 and int64.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64.
 
     Examples:
 
@@ -8299,6 +8359,12 @@ def gather_nd(input, index, name=None):
             output = fluid.layers.gather_nd(x, index)
 
     """
+    if in_dygraph_mode():
+        return core.ops.gather_nd(input, index)
+    check_variable_and_dtype(input, 'input',
+                             ['bool', 'float32', 'float64', 'int32', 'int64'],
+                             'gather_np')
+    check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np')
     helper = LayerHelper('gather_nd', **locals())
     dtype = helper.input_dtype()
     output = helper.create_variable_for_type_inference(dtype)
@@ -8310,6 +8376,7 @@ def gather_nd(input, index, name=None):
     return output
 
 
+@deprecated(since="2.0.0", update_to="paddle.scatter")
 def scatter(input, index, updates, name=None, overwrite=True):
     """
     :alias_main: paddle.scatter
@@ -8625,7 +8692,7 @@ def log(x, name=None):
     return out
 
 
-@templatedoc()
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu")
 def relu(x, name=None):
     """
     ${comment}
@@ -8667,11 +8734,9 @@ def relu(x, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.selu")
 def selu(x, scale=None, alpha=None, name=None):
     """
-    :alias_main: paddle.nn.functional.selu
-	:alias: paddle.nn.functional.selu,paddle.nn.functional.activation.selu
-	:old_api: paddle.fluid.layers.selu
 
     Selu Operator.
 
@@ -9286,7 +9351,7 @@ def pad2d(input,
     return out
 
 
-@templatedoc()
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.elu")
 def elu(x, alpha=1.0, name=None):
     """
     :alias_main: paddle.nn.functional.elu
@@ -9328,12 +9393,9 @@ def elu(x, alpha=1.0, name=None):
     return out
 
 
-@templatedoc()
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu6")
 def relu6(x, threshold=6.0, name=None):
     """
-    :alias_main: paddle.nn.functional.relu6
-	:alias: paddle.nn.functional.relu6,paddle.nn.functional.activation.relu6
-	:old_api: paddle.fluid.layers.relu6
 
     ${comment}
 
@@ -9369,7 +9431,10 @@ def relu6(x, threshold=6.0, name=None):
         type='relu6',
         inputs={'X': x},
         outputs={'Out': out},
-        attrs={'threshold': threshold})
+        attrs={
+            'threshold': threshold,
+            'use_mkldnn': core.globals()["FLAGS_use_mkldnn"]
+        })
     return out
 
 
@@ -9602,6 +9667,7 @@ def swish(x, beta=1.0, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.prelu")
 def prelu(x, mode, param_attr=None, name=None):
     """
     :api_attr: Static Graph
@@ -9661,7 +9727,8 @@ def prelu(x, mode, param_attr=None, name=None):
         ) >= 2, "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'"
         #NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]).
         # To be consistent with Prelu, it is simplified.
-        alpha_shape = [1, x.shape[1]]
+        #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
+        alpha_shape = [1, x.shape[1], 1, 1]
     elif mode == 'element':
         assert len(
             x.shape
@@ -9729,13 +9796,10 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.leaky_relu")
 @templatedoc()
 def leaky_relu(x, alpha=0.02, name=None):
     """
-    :alias_main: paddle.nn.functional.leaky_relu
-	:alias: paddle.nn.functional.leaky_relu,paddle.nn.functional.activation.leaky_relu
-	:old_api: paddle.fluid.layers.leaky_relu
-
     ${comment}
     Args:
         x(${x_type}): ${x_comment}
@@ -9764,19 +9828,7 @@ def leaky_relu(x, alpha=0.02, name=None):
             res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res])
             print(res_val) # [[-0.1, 2], [3, -0.4]]
     """
-    if in_dygraph_mode():
-        return core.ops.leaky_relu(x, 'alpha', alpha)
-
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'leaky_relu')
-
-    inputs = {'X': [x]}
-    attrs = {'alpha': alpha}
-    helper = LayerHelper('leaky_relu', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='leaky_relu', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    return out
+    return paddle.nn.functional.leaky_relu(x, alpha, name)
 
 
 def soft_relu(x, threshold=40.0, name=None):
@@ -9911,7 +9963,7 @@ def flatten(x, axis=1, name=None):
     return out
 
 
-def stack(x, axis=0):
+def stack(x, axis=0, name=None):
     """
 
     This OP stacks all the inputs :code:`x` along axis.
@@ -9966,7 +10018,7 @@ def stack(x, axis=0):
                                      must be the same. Supposing input is N dims
                                      Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
                                      Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
-                                     Support data types: float32, float64, int32, int64.
+                                     Supported data types: float32, float64, int32, int64.
         axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
                               R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
                               The default value of axis is 0.
@@ -9991,15 +10043,16 @@ def stack(x, axis=0):
             data = layers.stack(x1)  # stack according to axis 0, data.shape=[1, None, 1, 2]
 
     """
-
-    helper = LayerHelper('stack', **locals())
     axis = 0 if axis is None else axis
-
     if not isinstance(x, list) and not isinstance(x, tuple):
         x = [x]
+
+    if in_dygraph_mode():
+        return core.ops.stack(x, 'axis', axis)
+
+    helper = LayerHelper('stack', **locals())
     out = helper.create_variable_for_type_inference(x[0].dtype)
-    if not in_dygraph_mode() and \
-            x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+    if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
                             "number of the elements must be 1, but received %s." % len(x)
         out_index = helper.create_variable_for_type_inference(dtype="int32")
@@ -10114,12 +10167,12 @@ def unstack(x, axis=0, num=None):
     raised.
 
     Args:
-        x (Variable): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64.
+        x (Tensor): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64.
         axis (int): The axis along which the input is unstacked.
         num (int|None): The number of output variables.
 
     Returns:
-        list(Variable): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64.
+        list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64.
 
     Raises:
         ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D).
@@ -10128,7 +10181,7 @@ def unstack(x, axis=0, num=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
+            x = fluid.data(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
             y = fluid.layers.unstack(x, axis=1)  # unstack with second axis, which results 3 tensors with shape=[2, 5]
 
     """
@@ -10215,7 +10268,7 @@ def expand(x, expand_times, name=None):
     if in_dygraph_mode():
         if isinstance(expand_times, (list, tuple)):
             expand_times = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in expand_times
             ]
 
@@ -10340,6 +10393,7 @@ def expand_as(x, target_tensor, name=None):
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
 
 
+@deprecated(since='1.8.0', update_to="paddle.uniform")
 @templatedoc()
 def uniform_random_batch_size_like(input,
                                    shape,
@@ -10435,6 +10489,7 @@ def uniform_random_batch_size_like(input,
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.normal")
 @templatedoc()
 def gaussian_random(shape,
                     mean=0.0,
@@ -10609,6 +10664,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     return out
 
 
+@deprecated(since='1.8.0', update_to="paddle.normal")
 @templatedoc()
 def gaussian_random_batch_size_like(input,
                                     shape,
@@ -10826,11 +10882,11 @@ def slice(input, axes, starts, ends):
         if isinstance(starts, (list, tuple)) and isinstance(ends,
                                                             (list, tuple)):
             starts = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in starts
             ]
             ends = [
-                item.numpy()[0] if isinstance(item, Variable) else item
+                item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in ends
             ]
 
@@ -11197,6 +11253,7 @@ def rank(input):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.numel")
 def size(input):
     """
     **Size Layer**
@@ -11204,11 +11261,14 @@ def size(input):
     Returns the number of elements for a tensor, which is a int64 Tensor with shape [1].
 
     Args:
-        input (Variable): The input variable.
+        input (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64.
 
     Returns:
-        Variable: The number of elements for the input variable.
+        Tensor: The number of elements for the input Tensor.
 
+    Raises:
+        TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of bool, float16, float32, float64, int32, int64.
+    
     Examples:
         .. code-block:: python
 
@@ -11219,6 +11279,11 @@ def size(input):
             rank = layers.size(input) # 300
     """
 
+    if in_dygraph_mode():
+        return core.ops.size(x)
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        "size")
     helper = LayerHelper('size', **locals())
     out = helper.create_variable_for_type_inference(dtype='int64')
     helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out})
@@ -11434,11 +11499,17 @@ Examples:
     """
     if in_dygraph_mode():
         return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_add')
+            x,
+            y,
+            axis=axis,
+            act=act,
+            op_name='elementwise_add',
+            use_mkldnn=core.globals()["FLAGS_use_mkldnn"])
 
     return _elementwise_op(LayerHelper('elementwise_add', **locals()))
 
 
+@deprecated(since="2.0.0", update_to="paddle.divide")
 def elementwise_div(x, y, axis=-1, act=None, name=None):
     """
     :alias_main: paddle.elementwise_div
@@ -11862,6 +11933,7 @@ Examples:
     return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
 
 
+@deprecated(since="2.0.0", update_to="paddle.remainder")
 def elementwise_mod(x, y, axis=-1, act=None, name=None):
     """
     :alias_main: paddle.elementwise_mod
@@ -11899,6 +11971,7 @@ Examples:
     return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
 
 
+@deprecated(since="2.0.0", update_to="paddle.floor_divide")
 def elementwise_floordiv(x, y, axis=-1, act=None, name=None):
     """
     :alias_main: paddle.elementwise_floordiv
@@ -11962,7 +12035,7 @@ for func in [
         ],
         skip_attrs_set={
             "x_data_format", "y_data_format", "axis", "use_quantizer",
-            "Scale_x", "Scale_y", "Scale_out"
+            "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out"
         }) + """\n""" + str(func.__doc__)
 
 for func in []:
@@ -12066,15 +12139,18 @@ def logical_and(x, y, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             x_data = np.array([True, True, False, False], dtype=np.bool)
             y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
-            y = paddle.imperative.to_variable(y_data)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
             res = paddle.logical_and(x, y)
             print(res.numpy()) # [True False False False]
     """
-
+    if x.shape != y.shape:
+        raise TypeError(
+            'Input tensors must be same shape, but received x \'s shape: %s, y \'s shape: %s '
+            % (x.shape, y.shape))
     return _logical_op(
         op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
 
@@ -12108,15 +12184,18 @@ def logical_or(x, y, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             x_data = np.array([True, True, False, False], dtype=np.bool)
             y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
-            y = paddle.imperative.to_variable(y_data)
+            x = paddle.to_variable(x_data)
+            y = paddle.to_variable(y_data)
             res = paddle.logical_or(x, y)
             print(res.numpy()) # [True  True  True False]
     """
-
+    if x.shape != y.shape:
+        raise TypeError(
+            'Input tensors must be same shape, but received x \'s shape: %s, y \'s shape: %s '
+            % (x.shape, y.shape))
     return _logical_op(
         op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
 
@@ -12150,15 +12229,18 @@ def logical_xor(x, y, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             x_data = np.array([True, True, False, False], dtype=np.bool)
             y_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
-            y = paddle.imperative.to_variable(y_data)
+            x = paddle.to_variable(x_data)
+            y = paddle.to_variable(y_data)
             res = paddle.logical_xor(x, y)
             print(res.numpy()) # [False  True  True False]
     """
-
+    if x.shape != y.shape:
+        raise TypeError(
+            'Input tensors must be same shape, but received x \'s shape: %s, y \'s shape: %s '
+            % (x.shape, y.shape))
     return _logical_op(
         op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
 
@@ -12190,9 +12272,9 @@ def logical_not(x, out=None, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             x_data = np.array([True, False, True, False], dtype=np.bool)
-            x = paddle.imperative.to_variable(x_data)
+            x = paddle.to_variable(x_data)
             res = paddle.logical_not(x)
             print(res.numpy()) # [False  True False  True]
     """
@@ -12204,8 +12286,6 @@ def logical_not(x, out=None, name=None):
 @templatedoc()
 def clip(x, min, max, name=None):
     """
-    :alias_main: paddle.nn.clip
-	:alias: paddle.nn.clip,paddle.nn.clip.clip
 	:old_api: paddle.fluid.layers.clip
 
     ${comment}
@@ -12300,13 +12380,10 @@ def clip_by_norm(x, max_norm, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.mean")
 @templatedoc()
 def mean(x, name=None):
     """
-    :alias_main: paddle.mean
-	:alias: paddle.mean,paddle.tensor.mean,paddle.tensor.stat.mean
-	:old_api: paddle.fluid.layers.mean
-
     ${comment}
 
     Args:
@@ -12324,6 +12401,7 @@ def mean(x, name=None):
                 name='data', shape=[2, 3], dtype='float32')
             mean = fluid.layers.mean(input)
     """
+
     if in_dygraph_mode():
         return core.ops.mean(x)
 
@@ -13984,12 +14062,9 @@ def where(condition):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.sign")
 def sign(x):
     """
-    :alias_main: paddle.sign
-	:alias: paddle.sign,paddle.tensor.sign,paddle.tensor.math.sign
-	:old_api: paddle.fluid.layers.sign
-
     This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero.
 
     Args:
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 3adb243c8f83d0dc0d8c89daf5a630cb4bdce1fd..84cacea6ba5723f8a06fc87fa9c59d96f802e65a 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -14,18 +14,26 @@
 
 from __future__ import print_function
 import os
-from .layer_function_generator import generate_layer_fn, generate_activation_fn
+from .layer_function_generator import generate_layer_fn, generate_activation_fn, add_sample_code
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_, Variable
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from paddle.utils import deprecated
+
+__deprecated_func_name__ = {'tanh_shrink': 'tanhshrink', }
 
 __activations_noattr__ = [
     'sigmoid',
     'logsigmoid',
-    'exp',
+    'tanh_shrink',
+    'softplus',
+    'softsign',
     'tanh',
+]
+
+__unary_func__ = [
+    'exp',
     'atan',
-    'tanh_shrink',
     'sqrt',
     'rsqrt',
     'abs',
@@ -33,15 +41,13 @@ __activations_noattr__ = [
     'floor',
     'cos',
     'acos',
-    'asin',
     'sin',
     'sinh',
+    'asin',
     'cosh',
     'round',
     'reciprocal',
     'square',
-    'softplus',
-    'softsign',
 ]
 
 __all__ = []
@@ -57,9 +63,375 @@ globals()['_scale'] = generate_layer_fn('scale')
 globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
 
 __all__ += __activations_noattr__
+__all__ += __unary_func__
 
 for _OP in set(__activations_noattr__):
-    globals()[_OP] = generate_activation_fn(_OP)
+    _new_OP = _OP
+    if _OP in __deprecated_func_name__:
+        _new_OP = __deprecated_func_name__[_OP]
+    func = generate_activation_fn(_OP)
+    func = deprecated(
+        since="2.0.0", update_to="paddle.nn.functional.%s" % (_new_OP))(func)
+    globals()[_OP] = func
+
+for _OP in set(__unary_func__):
+    _new_OP = _OP
+    if _OP in __deprecated_func_name__:
+        _new_OP = __deprecated_func_name__[_OP]
+    func = generate_activation_fn(_OP)
+    func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(func)
+    globals()[_OP] = func
+
+add_sample_code(globals()["sigmoid"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = F.sigmoid(x)
+        print(out.numpy())
+        # [0.40131234 0.450166   0.52497919 0.57444252]
+
+""")
+
+add_sample_code(globals()["logsigmoid"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = F.logsigmoid(x)
+        print(out.numpy())
+        # [-0.91301525 -0.79813887 -0.64439666 -0.55435524]
+
+""")
+
+add_sample_code(globals()["exp"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.exp(x)
+        print(out.numpy())
+        # [0.67032005 0.81873075 1.10517092 1.34985881]
+
+""")
+
+add_sample_code(globals()["tanh"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.tanh(x)
+        print(out.numpy())
+        # [-0.37994896 -0.19737532  0.09966799  0.29131261]
+
+""")
+
+add_sample_code(globals()["atan"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.atan(x)
+        print(out.numpy())
+        # [-0.38050638 -0.19739556  0.09966865  0.29145679]
+
+""")
+
+add_sample_code(globals()["tanh_shrink"], r"""
+Examples:
+    .. code-block:: python
+
+        import paddle
+        import paddle.nn.functional as F
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+        out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
+
+""")
+
+add_sample_code(globals()["sqrt"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([0.1, 0.2, 0.3, 0.4])
+        x = paddle.to_variable(x_data)
+        out = paddle.sqrt(x)
+        print(out.numpy())
+        # [0.31622777 0.4472136  0.54772256 0.63245553]
+
+""")
+
+add_sample_code(globals()["rsqrt"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([0.1, 0.2, 0.3, 0.4])
+        x = paddle.to_variable(x_data)
+        out = paddle.rsqrt(x)
+        print(out.numpy())
+        # [3.16227766 2.23606798 1.82574186 1.58113883]
+
+""")
+
+add_sample_code(globals()["abs"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.abs(x)
+        print(out.numpy())
+        # [0.4 0.2 0.1 0.3]
+
+""")
+
+add_sample_code(globals()["ceil"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.ceil(x)
+        print(out.numpy())
+        # [-0. -0.  1.  1.]
+
+""")
+
+add_sample_code(globals()["floor"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.floor(x)
+        print(out.numpy())
+        # [-1. -1.  0.  0.]
+
+""")
+
+add_sample_code(globals()["cos"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.cos(x)
+        print(out.numpy())
+        # [0.92106099 0.98006658 0.99500417 0.95533649]
+
+""")
+
+add_sample_code(globals()["acos"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.acos(x)
+        print(out.numpy())
+        # [1.98231317 1.77215425 1.47062891 1.26610367]
+
+""")
+
+add_sample_code(globals()["sin"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.sin(x)
+        print(out.numpy())
+        # [-0.38941834 -0.19866933  0.09983342  0.29552021]
+
+""")
+
+add_sample_code(globals()["asin"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.asin(x)
+        print(out.numpy())
+        # [-0.41151685 -0.20135792  0.10016742  0.30469265]
+
+""")
+
+add_sample_code(globals()["cosh"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.cosh(x)
+        print(out.numpy())
+        # [1.08107237 1.02006676 1.00500417 1.04533851]
+
+""")
+
+add_sample_code(globals()["sinh"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.sinh(x)
+        print(out.numpy())
+        # [-0.41075233 -0.201336    0.10016675  0.30452029]
+
+""")
+
+add_sample_code(globals()["round"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.5, -0.2, 0.6, 1.5])
+        x = paddle.to_variable(x_data)
+        out = paddle.round(x)
+        print(out.numpy())
+        # [-1. -0.  1.  2.]
+
+""")
+
+add_sample_code(globals()["reciprocal"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.reciprocal(x)
+        print(out.numpy())
+        # [-2.5        -5.         10.          3.33333333]
+
+""")
+
+add_sample_code(globals()["square"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.disable_static()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_variable(x_data)
+        out = paddle.square(x)
+        print(out.numpy())
+        # [0.16 0.04 0.01 0.09]
+
+""")
+
+add_sample_code(globals()["softplus"], r"""
+Examples:
+    .. code-block:: python
+
+        import paddle
+        import paddle.nn.functional as F
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+        out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355]
+
+""")
+
+add_sample_code(globals()["softsign"], r"""
+Examples:
+    .. code-block:: python
+
+        import paddle
+        import paddle.nn.functional as F
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+        out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
+
+""")
 
 __all__ += ['softshrink']
 
@@ -116,6 +488,7 @@ __all__ += ['hard_shrink']
 _hard_shrink_ = generate_layer_fn('hard_shrink')
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.hardshrink")
 def hard_shrink(x, threshold=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                              'hard_shrink')
@@ -129,10 +502,6 @@ def hard_shrink(x, threshold=None):
 
 
 hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
-	:alias_main: paddle.nn.functional.hard_shrink
-	:alias: paddle.nn.functional.hard_shrink,paddle.nn.functional.activation.hard_shrink
-	:old_api: paddle.fluid.layers.hard_shrink
-
 Examples:
 
     >>> import paddle.fluid as fluid
@@ -145,6 +514,10 @@ __all__ += ['cumsum']
 _cum_sum_ = generate_layer_fn('cumsum')
 
 
+@deprecated(
+    since="2.0.0",
+    update_to="paddle.cumsum",
+    reason="New APIs for Paddle 2.0 are coming.")
 def cumsum(x, axis=None, exclusive=None, reverse=None):
     check_type(x, 'x', (Variable), 'cumsum')
     locals_var = locals().copy()
@@ -274,6 +647,7 @@ __all__ += ['gelu']
 _gelu_ = generate_layer_fn('gelu')
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.gelu")
 def gelu(x, approximate=False):
     locals_var = locals().copy()
     kwargs = dict()
@@ -284,10 +658,6 @@ def gelu(x, approximate=False):
 
 
 gelu.__doc__ = """
-	:alias_main: paddle.nn.functional.gelu
-	:alias: paddle.nn.functional.gelu,paddle.nn.functional.activation.gelu
-	:old_api: paddle.fluid.layers.gelu
-
 :strong:`GeLU Activation Operator`
 For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415).
 
@@ -362,7 +732,7 @@ __all__ += ['erf']
 _erf_ = generate_layer_fn('erf')
 
 
-def erf(x):
+def erf(x, name=None):
     locals_var = locals().copy()
     kwargs = dict()
     for name, val in locals_var.items():
@@ -372,10 +742,6 @@ def erf(x):
 
 
 erf.__doc__ = """
-	:alias_main: paddle.erf
-	:alias: paddle.erf,paddle.tensor.erf,paddle.tensor.math.erf,paddle.nn.functional.erf,paddle.nn.functional.activation.erf
-	:old_api: paddle.fluid.layers.erf
-
 :strong:`Erf Operator`
 For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function).
 
@@ -385,57 +751,22 @@ Equation:
 
 Args:
 
-    x(Variable): The input of Erf op, Tensor or LoDTensor, dtype: float32 or float64.
+    x (Tensor): The input tensor, it's data type should be float32, float64.
 
 Returns:
 
-    Variable: The output of Erf op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input.
+    Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input.
 
 Examples:
     
     .. code-block:: python
     
-        # declarative mode
         import numpy as np
-        from paddle import fluid
-        
-        x = fluid.data(name="x", shape=(-1, 3), dtype="float32")
-        y = fluid.layers.erf(x)
-        
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        start = fluid.default_startup_program()
-        main = fluid.default_main_program()
-        
-        data = np.random.randn(2, 3).astype("float32")
-        exe.run(start)
-        
-        y_np, = exe.run(main, feed={"x": data}, fetch_list=[y])
-        
-        data
-        # array([[ 0.4643714 , -1.1509596 ,  1.2538221 ],
-        #        [ 0.34369683,  0.27478245,  1.1805398 ]], dtype=float32)
-        y_np
-        # array([[ 0.48863927, -0.8964121 ,  0.9237998 ],
-        #        [ 0.37307587,  0.30242872,  0.9049887 ]], dtype=float32)
-
-    .. code-block:: python
-    
-        # imperative mode
-        import numpy as np
-        from paddle import fluid
-        import paddle.fluid.dygraph as dg
-        
-        data = np.random.randn(2, 3).astype("float32")
-        place = fluid.CPUPlace()
-        with dg.guard(place) as g:
-            x = dg.to_variable(data)
-            y = fluid.layers.erf(x)
-            y_np = y.numpy()
-        data
-        # array([[ 0.4643714 , -1.1509596 ,  1.2538221 ],
-        #        [ 0.34369683,  0.27478245,  1.1805398 ]], dtype=float32)
-        y_np
-        # array([[ 0.48863927, -0.8964121 ,  0.9237998 ],
-        #        [ 0.37307587,  0.30242872,  0.9049887 ]], dtype=float32)
+        import paddle
+        paddle.disable_static()
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.to_tensor(x_data)
+        out = paddle.erf(x)
+        print(out.numpy())
+        # [-0.42839236 -0.22270259  0.11246292  0.32862676]
 """
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index ecc58768522831b55f620cb6dc911630e2c2ad68..bc1368b562d7b354ce34dc87679fd8a0c5a3d012 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -127,7 +127,8 @@ class RNNCell(object):
         else:
             integer_types = (int, )
         check_variable_and_dtype(batch_ref, 'batch_ref',
-                                 ['float32', 'float64'], 'RNNCell')
+                                 ['float32', 'float64', 'int32', 'int64'],
+                                 'RNNCell')
         check_type(shape, 'shape', (list, tuple, type(None), integer_types),
                    'RNNCell')
         if isinstance(shape, (list, tuple)):
@@ -2212,9 +2213,9 @@ def lstm(input,
         input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64
         init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
                        If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
+        max_len (int): This parameter has no effect and will be discarded.
         init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
                        If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len.
         hidden_size (int): hidden size of the LSTM.
         num_layers (int): total layers number of the LSTM.
         dropout_prob(float, optional): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -2255,7 +2256,6 @@ def lstm(input,
             data = fluid.data(name='x', shape=[None, 100], dtype='int64')
             emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
             batch_size = 20
-            max_len = 100
             dropout_prob = 0.2
             input_size = 100
             hidden_size = 150
@@ -2308,9 +2308,11 @@ def lstm(input,
     out = helper.create_variable_for_type_inference(dtype)
     last_h = helper.create_variable_for_type_inference(dtype)
     last_c = helper.create_variable_for_type_inference(dtype)
-
-    cache = helper.create_variable(
-        persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
+    reserve = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+    state_out = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+    state_out.persistable = True
 
     helper.append_op(
         type='cudnn_lstm',
@@ -2319,15 +2321,15 @@ def lstm(input,
             'InitH': init_h,
             'InitC': init_c,
             'W': weight,
-            'Cache': cache,
         },
         outputs={
             'Out': out,
-            'last_h': last_h,
-            'last_c': last_c,
+            'LastH': last_h,
+            'LastC': last_c,
+            'Reserve': reserve,
+            'StateOut': state_out,
         },
         attrs={
-            'max_len': max_len,
             'is_bidirec': is_bidirec,
             'input_size': input_size,
             'hidden_size': hidden_size,
@@ -3101,7 +3103,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
                              'beam_search_encode')
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
-    sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
+    sentence_scores = helper.create_variable_for_type_inference(
+        dtype=scores.dtype)
 
     helper.append_op(
         type="beam_search_decode",
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index e33b34cc9254b18a18c293fb3670203fecdeb38f..77a78eb4a14a0a5ad9be9cff71131ca473106ab8 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -26,6 +26,7 @@ from .. import core
 from .layer_function_generator import templatedoc
 from . import utils
 from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
+from paddle.utils import deprecated
 import numpy
 import warnings
 
@@ -317,7 +318,7 @@ def concat(input, axis=0, name=None):
     if in_dygraph_mode():
         if isinstance(axis, Variable):
             axis = axis.numpy()
-            axis = axis[0]
+            axis = axis.item(0)
         return core.ops.concat(input, 'axis', axis)
 
     check_type(input, 'input', (list, tuple, Variable), 'concat')
@@ -642,7 +643,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
         shape(list|tuple|Tensor): Shape of the output Tensor, the data type of ``shape`` is int32 or int64.
             If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
             If ``shape`` is an Tensor, it should be an 1-D Tensor with date type int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Tensor which can
+        dtype(np.dtype|str): Data type of the output Tensor which can
             be float16, float32, float64, int32, int64.
         value(bool|float|int|Tensor): The constant value used to initialize 
             the Tensor to be created. If ``value`` is an Tensor, it should be an 1-D Tensor.
@@ -685,8 +686,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     """
 
     attrs = {'force_cpu': force_cpu}
+    dtype = convert_dtype(dtype)
     if not isinstance(value, Variable):
-        if convert_dtype(dtype) in ['int64', 'int32']:
+        if dtype in ['int64', 'int32']:
             attrs['str_value'] = str(int(value))
         else:
             attrs['str_value'] = str(float(value))
@@ -697,10 +699,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
             out = _varbase_creator(dtype=dtype)
 
         if isinstance(value, Variable):
-            if convert_dtype(dtype) in ['int64', 'int32']:
-                attrs['str_value'] = str(int(value.numpy()))
+            if dtype in ['int64', 'int32']:
+                attrs['str_value'] = str(int(value.numpy().item(0)))
             else:
-                attrs['str_value'] = str(float(value.numpy()))
+                attrs['str_value'] = str(float(value.numpy().item(0)))
 
         core.ops.fill_constant(out, 'value',
                                float(value), 'force_cpu', force_cpu, 'dtype',
@@ -712,6 +714,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     helper = LayerHelper("fill_constant", **locals())
     inputs = {}
     if isinstance(value, Variable):
+        if convert_dtype(value.dtype) != dtype:
+            value = cast(value, dtype)
         inputs['ValueTensor'] = value
 
     check_dtype(dtype, 'dtype',
@@ -743,6 +747,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     return out
 
 
+@deprecated(since='1.8.0', update_to="paddle.fill_constant")
 @templatedoc()
 def fill_constant_batch_size_like(input,
                                   shape,
@@ -1037,7 +1042,7 @@ def ones(shape, dtype, force_cpu=False):
 
     Parameters:
         shape(tuple|list|Tensor): Shape of output Tensor, the data type of shape is int32 or int64.
-        dtype (np.dtype|core.VarDesc.VarType|str): Data type of output Tensor, it supports
+        dtype (np.dtype|str): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64.
         force_cpu (bool, optional): Whether force to store the output Tensor in CPU memory.
             If :attr:`force_cpu` is False, the output Tensor will be stored in running device memory.
@@ -1070,7 +1075,7 @@ def zeros(shape, dtype, force_cpu=False, name=None):
 
     Parameters:
         shape(tuple|list|Tensor): Shape of output Tensor, the data type of ``shape`` is int32 or int64.
-        dtype (np.dtype|core.VarDesc.VarType|str): Data type of output Tensor, it supports
+        dtype (np.dtype|str): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64.
         force_cpu (bool, optional): Whether force to store the output Tensor in CPU memory.
             If :attr:`force_cpu` is False, the output Tensor will be stored in running device memory.
@@ -1432,14 +1437,14 @@ def linspace(start, stop, num, dtype=None, name=None):
     This OP return fixed number of evenly spaced values within a given interval.
 
     Args:
-        start(float|Tensor): The input :attr:`start` is start variable of range. It is a float scalar, \
-            or a Tensor of shape [1] with input data type float32, float64.
-        stop(float|Tensor): The input :attr:`stop` is start variable of range. It is a float scalar, \
-            or a Tensor of shape [1] with input data type float32, float64.
+        start(int|float|Tensor): The input :attr:`start` is start variable of range. It is a scalar, \
+            or a Tensor of shape [1] with input data type int32, int64, float32 or float64.
+        stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \
+            or a Tensor of shape [1] with input data type int32, int64, float32 or float64.
         num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \
-            or a Tensor of shape [1] with data type int32.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output tensor, it could be 'float32' and 'float64'.
-            Default: if None, the data type is float32.
+            or a Tensor of shape [1] with data type int32 or int64.
+        dtype(np.dtype|str, optional): The data type of output tensor, it could be
+            int32, int64, float32 and float64. Default: if None, the data type is float32.
         name(str, optional): Normally there is no need for user to set this property. 
             For more information, please refer to :ref:`api_guide_Name`.Default: None.
 
@@ -1449,9 +1454,11 @@ def linspace(start, stop, num, dtype=None, name=None):
         the value with input :attr:`start`. 
 
     Raises:
-        TypeError: The ``dtype`` must be one of float32 and float64.
-        TypeError: The data type of ``start`` and ``stop``  must be one of float32 and float64.
-        TypeError: The data type of ``num`` must be one of int32 and int64.
+        TypeError: The ``dtype`` must be one of int32, int64, float32 and float64.
+        TypeError: The type of ``num`` must be int When it's not a Tensor.
+        TypeError: The data type of ``num`` must be int32  When it's  a Tensor.
+        TypeError: The data type of ``start`` and  ``stop`` must be same as ``dtype`` When it's  a Tensor.
+
 
 
     Examples:
@@ -1464,29 +1471,47 @@ def linspace(start, stop, num, dtype=None, name=None):
     """
     if dtype is None:
         dtype = 'float32'
+    tensor_num = num
+    tensor_start = start
+    tensor_stop = stop
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
     if not isinstance(start, Variable):
-        start = fill_constant([1], dtype, start)
+        tensor_start = fill_constant([1], dtype, start)
     if not isinstance(stop, Variable):
-        stop = fill_constant([1], dtype, stop)
+        tensor_stop = fill_constant([1], dtype, stop)
     if not isinstance(num, Variable):
-        num = fill_constant([1], 'int32', num)
+        tensor_num = fill_constant([1], 'int32', num)
     if in_dygraph_mode():
-        return core.ops.linspace(start, stop, num)
+        return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype',
+                                 dtype)
 
     helper = LayerHelper("linspace", **locals())
 
-    check_dtype(start.dtype, 'start', ['float32', 'float64'], 'linspace')
-    check_dtype(stop.dtype, 'stop', ['float32', 'float64'], 'linspace')
-    check_dtype(num.dtype, 'num', ['int32', 'int64'], 'linspace')
-    check_dtype(dtype, 'dtype', ['float32', 'float64'], 'linspace')
+    if isinstance(start, Variable):
+        check_dtype(start.dtype, 'start', (convert_dtype(dtype)), 'linspace')
+    else:
+        check_type(start, 'start', (int, float), 'linspace')
 
-    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+    if isinstance(stop, Variable):
+        check_dtype(stop.dtype, 'stop', (convert_dtype(dtype)), 'linspace')
+    else:
+        check_type(stop, 'stop', (int, float), 'linspace')
+    if isinstance(num, Variable):
+        check_dtype(num.dtype, 'num', ['int32'], 'linspace')
+    else:
+        check_type(num, 'num', (int), 'linspace')
+    check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'],
+                'linspace')
+
+    out = helper.create_variable_for_type_inference(dtype=dtype)
 
     helper.append_op(
         type='linspace',
-        inputs={'Start': start,
-                'Stop': stop,
-                'Num': num},
+        inputs={'Start': tensor_start,
+                'Stop': tensor_stop,
+                'Num': tensor_num},
+        attrs={'dtype': dtype},
         outputs={'Out': [out]})
     return out
 
@@ -1534,6 +1559,7 @@ def zeros_like(x, out=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.diag")
 def diag(diagonal):
     """
 	:alias_main: paddle.diag
@@ -1595,7 +1621,7 @@ def eye(num_rows,
             If None, default: num_rows.
         batch_shape(list, optional): If provided, the returned tensor will have a leading
             batch size of this shape, the data type of ``batch_shape`` is int. Default is None.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned tensor.
+        dtype(np.dtype|str, optional): The data type of the returned tensor.
             It should be int32, int64, float16, float32, float64, default is 'float32'.
         name(str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 757d2189904e940ef6582c4a963aaadaf8b88c2f..a41f23e2dde3e87ad3bdf750d87456cfb973a9d3 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -40,6 +40,7 @@ from paddle.fluid.layers import tensor
 from functools import reduce
 from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
+import paddle
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -61,21 +62,23 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def __init__(self,
                  learning_rate,
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
                  name=None):
+        # Because of the loop import, so place it in the function body
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         self._parameter_list = list(
             parameter_list) if parameter_list is not None else None
         self._name = name
         if framework.in_dygraph_mode():
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, LearningRateDecay):
+            if not isinstance(learning_rate,
+                              (float, LearningRateDecay, _LRScheduler)):
                 raise TypeError(
-                    "learning rate should be float or LearningRateDecay, got %s here"
+                    "learning rate should be float or _LRScheduler, got %s here"
                     % type(learning_rate))
             if self._parameter_list is None:
                 raise AttributeError(
@@ -90,11 +93,11 @@ class Optimizer(object):
                             % regularization.__str__())
                         break
         else:
-            if not isinstance(learning_rate, float) and \
-                    not isinstance(learning_rate, framework.Variable):
+            if not isinstance(learning_rate,
+                              (float, framework.Variable, _LRScheduler)):
                 raise TypeError(
-                    "learning rate should be float or Variable, got %s here" %
-                    type(learning_rate))
+                    "learning rate should be float or _LRScheduler, got %s here"
+                    % type(learning_rate))
 
         if grad_clip is not None:
             if not isinstance(grad_clip, GradientClipBase):
@@ -144,11 +147,15 @@ class Optimizer(object):
                     state_dict = adam.state_dict()
 
         '''
+        from paddle.optimizer.lr_scheduler import _LRScheduler
         state_dict = {}
         for k, v in self._accumulators.items():
             for para_name, var_tmp in v.items():
                 state_dict[var_tmp.name] = var_tmp
         # global step if use lr decay
+        if isinstance(self._learning_rate, _LRScheduler):
+            state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
+            return state_dict
         if isinstance(self._learning_rate, LearningRateDecay):
             state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
 
@@ -192,6 +199,9 @@ class Optimizer(object):
                     adam.set_dict(opti_state_dict)
 
         '''
+        from paddle.optimizer.lr_scheduler import _LRScheduler
+        if isinstance(self._learning_rate, _LRScheduler):
+            self._learning_rate.set_dict(state_dict["LR_Scheduler"])
 
         if isinstance(self._learning_rate, LearningRateDecay):
             self._learning_rate.set_dict(state_dict["LR_Scheduler"])
@@ -252,6 +262,30 @@ class Optimizer(object):
         return self._opti_name_list
 
     def _create_global_learning_rate(self):
+        from paddle.optimizer.lr_scheduler import _LRScheduler
+        if isinstance(self._learning_rate, _LRScheduler):
+            lr_var = self._global_learning_rate()
+            # only create global lr_var once
+            if not isinstance(lr_var, framework.Variable):
+                lr_name = unique_name.generate('learning_rate')
+                self._learning_rate._var_name = lr_name
+                lr_var = self.helper.create_global_variable(
+                    name=lr_name,
+                    shape=[1],
+                    persistable=True,
+                    stop_gradient=True,
+                    dtype='float32' if self._dtype is None else self._dtype)
+                main_prog = framework.default_main_program()
+                main_prog.lr_sheduler = self._learning_rate
+                main_prog.lr_var = lr_var
+                self._learning_rate_map[framework.default_main_program(
+                )] = lr_var
+
+            lr_value = float(self._learning_rate())
+            self.helper.set_variable_initializer(
+                lr_var, initializer=Constant(value=lr_value))
+            return
+
         if imperative_base.enabled():
             # create learning rate Variable
             if isinstance(self._learning_rate, float):
@@ -755,7 +789,7 @@ class Optimizer(object):
                 params_grads = append_backward(loss, parameter_list,
                                                act_no_grad_set, callbacks)
                 # Note: since we can't use all_reduce_op now,
-                #  dgc_op should be the last op of one grad.
+                # dgc_op should be the last op of one grad.
                 self._append_dgc_ops(params_grads)
         return params_grads
 
@@ -864,7 +898,7 @@ class Optimizer(object):
             if p.trainable:
                 p.clear_gradient()
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -982,7 +1016,7 @@ class SGDOptimizer(Optimizer):
             name=name)
         self.type = "sgd"
 
-    @no_grad
+    @no_grad()
     def _append_optimize_op(self, block, param_and_grad):
         lr = self._create_param_lr(param_and_grad)
         if framework.in_dygraph_mode():
@@ -1519,7 +1553,7 @@ class DGCMomentumOptimizer(Optimizer):
         dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
                          [param_var.name, grad_var.name])
 
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def apply_gradients(self, params_grads):
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
         params_grads, table_param_and_grad, table_optimize_op = \
@@ -3685,7 +3719,8 @@ class PipelineOptimizer(object):
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         if framework.in_dygraph_mode():
             raise Exception("In dygraph, don't support PipelineOptimizer.")
-        if not isinstance(optimizer, Optimizer):
+        if not isinstance(optimizer, Optimizer) and not isinstance(
+                optimizer, paddle.optimizer.Optimizer):
             raise ValueError("The 'optimizer' parameter for "
                              "PipelineOptimizer must be an instance of "
                              "Optimizer, but the given type is {}.".format(
@@ -4841,6 +4876,11 @@ class LookaheadOptimizer(object):
 
             mod = layers.elementwise_mod(step, k)
             with layers.control_flow.Switch() as switch:
+                with switch.case(step == one_var):
+                    for param_name in params:
+                        fast_var = main_block.var(param_name)
+                        slow_var = param_to_slow[param_name]
+                        layers.assign(input=fast_var, output=slow_var)
                 with switch.case(mod == zero_var):
                     for param_name in params:
                         fast_var = main_block.var(param_name)
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index a45443632b04835bc8f3b3f2f167433c7a8b49d4..8e0470bededd4fdb8aec03893590bdba35bbb364 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -204,6 +204,9 @@ class WeightNormParamAttr(ParamAttr):
     """
 	:api_attr: Static Graph
 
+    Note:
+        Please use 'paddle.nn.utils.weight_norm' in dygraph mode.
+
     Parameter of weight Norm. Weight Norm is a reparameterization of the weight vectors
     in a neural network that decouples the magnitude of those weight vectors from
     their direction. Weight Norm has been implemented as discussed in this
@@ -216,6 +219,7 @@ class WeightNormParamAttr(ParamAttr):
         It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. 
         There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , 
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
+        
 
     Args:
         dim(int): Dimension over which to compute the norm. Dim is a non-negative
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 0289ecea34acf65d01aa13b555ee523f7127b48d..76c95be75d67d60cd59efe13ecba6f01a1c1d614 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -22,13 +22,13 @@ from .framework import Program, Variable, program_guard, default_main_program, d
 from .executor import global_scope
 from .data_feeder import DataFeeder, BatchedTensorProvider
 from .multiprocess_utils import multiprocess_queue_set, CleanupFuncRegistrar, _cleanup_mmap, _cleanup, _set_SIGCHLD_handler
-from .dataloader import BatchSampler, Dataset
-from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, default_collate_fn
+from .dataloader import BatchSampler, Dataset, IterableDataset
+from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, _DatasetKind, default_collate_fn
+from .dataloader.batch_sampler import _InfiniteIterableSampler
 from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
 from .unique_name import UniqueNameGenerator
 import logging
 import warnings
-from .dataset import DatasetBase, InMemoryDataset
 
 ### Dygraph DataLoader configs ###
 import os
@@ -49,6 +49,7 @@ __all__ = ['PyReader', 'DataLoader', 'default_collate_fn']
 data_loader_unique_name_generator = UniqueNameGenerator()
 
 KEEP_DATA_LOADER_ORDER = True
+USE_PINNED_MEMORY = None
 
 
 def keep_data_loader_order(*args):
@@ -60,6 +61,15 @@ def keep_data_loader_order(*args):
         KEEP_DATA_LOADER_ORDER = args[0]
 
 
+def use_pinned_memory(*args):
+    global USE_PINNED_MEMORY
+    if len(args) == 0:
+        return USE_PINNED_MEMORY
+    else:
+        assert len(args) == 1 and isinstance(args[0], bool)
+        USE_PINNED_MEMORY = args[0]
+
+
 def _convert_places(places):
     if not isinstance(places, (list, tuple)):
         places = [places]
@@ -127,8 +137,9 @@ class DataLoader(object):
 
     Args:  
         dataset(Dataset): the dataset to load data from, should be an
-            instance of subclass of :code:`paddle.io.Dataset`.
-        feed_list (list(Variable)|tuple(Variable)): feed variable list.
+            instance of subclass of :code:`paddle.io.Dataset` or
+            :code:`paddle.io.IterableDataset`.
+        feed_list (list(Tensor)|tuple(Tensor)): feed variable list.
             The variables should be created by :code:`fluid.data()`.
             :attr:`feed_list` must be set if :attr:`return_list` is
             False. Default None.
@@ -286,6 +297,10 @@ class DataLoader(object):
 
             # -------------------------------------------------------
 
+    .. note::
+        For reading iterable dataset with multiprocess Dataloader,
+        please see :code:`paddle.io.IterableDataset`
+
     """
 
     def __init__(self,
@@ -339,6 +354,18 @@ class DataLoader(object):
         assert timeout >= 0, "timeout should be a non-negative value"
         self.timeout = timeout
 
+        if isinstance(dataset, IterableDataset):
+            self.dataset_kind = _DatasetKind.ITER
+            if shuffle:
+                raise ValueError(
+                    "IterableDataset not support shuffle, but got shuffle={}".
+                    format(shuffle))
+            if batch_sampler is not None:
+                raise ValueError(
+                    "IterableDataset expect unspecified batch_sampler")
+        else:
+            self.dataset_kind = _DatasetKind.MAP
+
         if batch_sampler is not None:
             assert isinstance(batch_sampler, BatchSampler), \
                 "batch_sampler should be None or subclass instance " \
@@ -351,11 +378,20 @@ class DataLoader(object):
             assert batch_size is not None and batch_size > 0, \
                 "batch_size should be a positive value when " \
                 "batch_sampler is not given"
-            self.batch_sampler = BatchSampler(
-                dataset=dataset,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                drop_last=drop_last)
+            if isinstance(dataset, IterableDataset):
+                self.batch_sampler = _InfiniteIterableSampler(dataset,
+                                                              batch_size)
+            else:
+                self.batch_sampler = BatchSampler(
+                    dataset=dataset,
+                    batch_size=batch_size,
+                    shuffle=shuffle,
+                    drop_last=drop_last)
+
+        self.pin_memory = False
+        if in_dygraph_mode():
+            self.pin_memory = True if use_pinned_memory(
+            ) is None else use_pinned_memory()
 
     def __len__(self):
         return len(self.batch_sampler)
@@ -715,6 +751,8 @@ class DygraphGeneratorLoader(DataLoaderBase):
         # mode, this thread is used to get next batch data from self._batch_reader, then 
         # push it into self._blocking_queue
         self._thread = None
+        self._pin_memory = True if use_pinned_memory(
+        ) is None else use_pinned_memory()
 
     @property
     def queue(self):
@@ -760,7 +798,8 @@ class DygraphGeneratorLoader(DataLoaderBase):
         self._reader = None
         self._reader = core.create_py_reader(
             self.queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_double_buffer, True)
+            self._need_check_feed, self._places, self._use_double_buffer, True,
+            self._pin_memory)
 
     def _start(self):
         if self._use_multiprocess:
@@ -1000,7 +1039,7 @@ class GeneratorLoader(DataLoaderBase):
         self._reader = core.create_py_reader(
             self.queue, self._var_names, self._shapes, self._dtypes,
             self._need_check_feed, self._places, self._use_double_buffer,
-            self._drop_last)
+            self._drop_last, False)
 
     def _init_non_iterable(self):
         lod_levels = []
@@ -1670,7 +1709,7 @@ class PyReader(DataLoaderBase):
 
 class DatasetLoader(DataLoaderBase):
     def __init__(self, dataset, places, drop_last):
-        assert isinstance(dataset,
+        assert isinstance(dataset, paddle.distributed.fleet.dataset.
                           DatasetBase), "dataset must be type of DatasetBase"
         assert not in_dygraph_mode(
         ), "DatasetLoader is not supported in dygraph mode yet"
@@ -1686,7 +1725,7 @@ class DatasetLoader(DataLoaderBase):
 
         dataset.set_thread(thread_num)
 
-        if isinstance(dataset,
+        if isinstance(dataset, paddle.distributed.fleet.dataset.
                       InMemoryDataset) and dataset.queue_num > thread_num:
             logging.warn("queue_num {} which is set in Dataset is ignored".
                          format(dataset.queue_num))
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
old mode 100755
new mode 100644
index d73b9511b76ed6585c662264e99fe41f3354bc29..5122f961f48cc58b573359de979fdc111a59c9ad
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -22,6 +22,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
 list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
 list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
 list(APPEND MIXED_DIST_TEST_OPS test_launch)
+list(APPEND MIXED_DIST_TEST_OPS test_c_comm_init_op)
 list(APPEND MIXED_DIST_TEST_OPS test_launch_ps)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
@@ -32,6 +33,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
 list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
@@ -39,9 +42,11 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
@@ -51,6 +56,14 @@ if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_allgather)
     LIST(REMOVE_ITEM TEST_OPS test_allreduce)
     LIST(REMOVE_ITEM TEST_OPS test_broadcast)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_reduce)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api)
 endif()
@@ -86,8 +99,18 @@ if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_ref_by_trainer_id_op)
 endif()
 
+
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint3)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_multiple)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_dist_basic)
+LIST(REMOVE_ITEM TEST_OPS test_hdfs1)
+LIST(REMOVE_ITEM TEST_OPS test_hdfs2)
+LIST(REMOVE_ITEM TEST_OPS test_hdfs3)
+LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
 if(APPLE OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_hdfs)
     LIST(REMOVE_ITEM TEST_OPS test_fs_interface)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_metric)
 endif()
@@ -100,6 +123,8 @@ if (NOT ${WITH_GPU})
     list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext)
     LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding)
     LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm)
+    LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
 elseif(${CUDNN_VERSION} VERSION_LESS 7100)
     LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
 endif()
@@ -184,16 +209,18 @@ function(py_test_modules TARGET_NAME)
   endif()
 endfunction()
 
+
 function(bash_test_modules TARGET_NAME)
     if(NOT WITH_TESTING)
         return()
     endif()
 
     set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS LABELS)
+    set(oneValueArgs TIMEOUT START_BASH)
+    set(multiValueArgs DEPS ENVS LABELS)
     cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
+
     set(timeout 350)
     if(${bash_test_modules_TIMEOUT})
         set(timeout ${bash_test_modules_TIMEOUT})
@@ -204,13 +231,13 @@ function(bash_test_modules TARGET_NAME)
             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
             TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
             WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     else()
         add_test(NAME ${TARGET_NAME}
             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
             TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
 
@@ -225,6 +252,51 @@ function(bash_test_modules TARGET_NAME)
     endif()
 endfunction()
 
+function(parallel_bash_test_modules TARGET_NAME)
+    if(NOT WITH_TESTING)
+        return()
+    endif()
+
+    set(options SERIAL)
+    set(oneValueArgs TIMEOUT START_BASH)
+    set(multiValueArgs DEPS ENVS LABELS UnitTests)
+    cmake_parse_arguments(parallel_bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+
+    set(timeout 120)
+    if(${parallel_bash_test_modules_TIMEOUT})
+        set(timeout ${parallel_bash_test_modules_TIMEOUT})
+    endif()
+
+    list(JOIN  parallel_bash_test_modules_UnitTests " " uts_string)
+
+    if(WITH_COVERAGE)
+        add_test(NAME ${TARGET_NAME}
+            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
+            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
+            WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    else()
+        add_test(NAME ${TARGET_NAME}
+            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
+            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    endif()
+
+    if (parallel_bash_test_modules_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+    endif()
+
+    if(parallel_bash_test_modules_LABELS)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout} LABELS ${parallel_bash_test_modules_LABELS})
+    else()
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT ${timeout})
+    endif()
+endfunction()
+
+
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_profiler)
@@ -259,6 +331,9 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
 
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
+list(REMOVE_ITEM TEST_OPS test_sampling_id_op)
+
+
 if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
   list(REMOVE_ITEM TEST_OPS test_dataset_dataloader)
@@ -271,6 +346,7 @@ if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_static)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dynamic)
   list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception)
+  list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset)
 endif()
 
 if(NOT WITH_GPU OR WIN32 OR APPLE)
@@ -345,23 +421,26 @@ if(WITH_DISTRIBUTE)
     # FIXME(typhoonzero): add these tests back
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
 
     #not need
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base")
 
-    # FIXME(seiriosX) will readd after PR 22957  Merged
+
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_text_classification")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec")
 
+    # FIXME(seiriosX) will fix this
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
+
     py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
     py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
     py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS})
@@ -371,6 +450,8 @@ if(WITH_DISTRIBUTE)
     py_test_modules(test_collective_optimizer MODULES test_collective_optimizer)
     if(NOT APPLE)
     	   py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS})
+    	   py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS})
+    	   py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS})
 	   py_test_modules(test_fleet_graph_execution_meta_optimizer MODULES test_fleet_graph_execution_meta_optimizer ENVS ${dist_ENVS})
 	   py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
@@ -378,9 +459,11 @@ if(WITH_DISTRIBUTE)
            py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
+	   py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
         if(NOT WIN32)
             py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
             py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
         endif(NOT WIN32)
     endif(NOT APPLE)
     if(WITH_DGC)
@@ -398,15 +481,17 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
-            bash_test_modules(test_launch MODULES test_launch.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+            bash_test_modules(test_launch START_BASH test_launch.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+            bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
             py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
         endif()
-        bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        bash_test_modules(test_fleet_launch MODULES test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+
+        bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
 
         set(dist_ut_port 20001)
         foreach(TEST_OP ${DIST_TEST_OPS})
-            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
+            bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
             MATH(EXPR dist_ut_port "${dist_ut_port}+50")
         endforeach(TEST_OP)
     endif(NOT APPLE)
@@ -442,6 +527,18 @@ if(NOT WIN32)
     set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 endif()
 
+if(NOT APPLE AND NOT WIN32)
+    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
+endif()
+
 add_subdirectory(sequence)
 add_subdirectory(dygraph_to_static)
 
@@ -481,4 +578,14 @@ if(NOT WIN32 AND NOT APPLE)
     set_tests_properties(test_multiprocess_dataloader_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_multiprocess_dataloader_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(test_multiprocess_dataloader_iterable_dataset_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 endif()
+
+# setting timeout value for old unittests
+# set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200)
+set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150)
+set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200)
+set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150)
+set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
+set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150)
diff --git a/python/paddle/fluid/tests/unittests/__init__.py b/python/paddle/fluid/tests/unittests/__init__.py
index b94a21a7e406b833797f8f521c62a2351c2bc30a..193b91cdaa13293ca920a8b79826bb71657c5d56 100644
--- a/python/paddle/fluid/tests/unittests/__init__.py
+++ b/python/paddle/fluid/tests/unittests/__init__.py
@@ -10,4 +10,15 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License.p
+
+# Note: On Windows, import form subdirectories such as dirA()->dirB(), current directory 
+# will still be dirA(), But is should be dirB(). So it will ModulNotFoundError
+# please refer to https://stackoverflow.com/questions/8953844/import-module-from-subfolder
+
+import os
+if os.name == 'nt':
+    import sys
+    dirname, filename = os.path.split(os.path.abspath(__file__))
+    sys.path.insert(0, dirname)
+    print(sys.path)
diff --git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..529ff4ec45d1fdc6d1d8e765e38cff53d36aade7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+BATCH_NUM = 4
+BATCH_SIZE = 1
+
+#IMAGE_SIZE = 128
+CLASS_NUM = 2
+
+USE_GPU = False  # whether use GPU to run model
+places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
+
+logger = None
+
+
+def get_logger():
+    global logger
+    logger = acp._get_logger(20)
+    return logger
+
+
+def get_random_images_and_labels(image_shape, label_shape):
+    image = np.random.random(size=image_shape).astype('float32')
+    label = np.random.random(size=label_shape).astype('int64')
+    return image, label
+
+
+def sample_list_generator_creator():
+    def __reader__():
+        for _ in range(BATCH_NUM):
+            sample_list = []
+            for _ in range(BATCH_SIZE):
+                image, label = get_random_images_and_labels([4, 4], [1])
+                sample_list.append([image, label])
+
+            yield sample_list
+
+    return __reader__
+
+
+class AutoCheckpointBase(unittest.TestCase):
+    def _init_env(self,
+                  exe,
+                  main_prog,
+                  startup_prog,
+                  minimize=True,
+                  iterable=True):
+        def simple_net():
+            image = fluid.data(name='image', shape=[-1, 4, 4], dtype='float32')
+            label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
+
+            fc_tmp = fluid.layers.fc(image, size=CLASS_NUM)
+            cross_entropy = fluid.layers.softmax_with_cross_entropy(fc_tmp,
+                                                                    label)
+            loss = fluid.layers.reduce_mean(cross_entropy)
+            sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            if minimize:
+                sgd.minimize(loss)
+            return sgd, loss, image, label
+
+        with program_guard(main_prog, startup_prog):
+            sgd, loss, image, label = simple_net()
+
+            if minimize:
+                compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
+                    loss_name=loss.name)
+            else:
+                compiled = None
+            loader = fluid.io.DataLoader.from_generator(
+                feed_list=[image, label],
+                capacity=64,
+                use_double_buffer=True,
+                iterable=iterable)
+
+            loader.set_sample_list_generator(sample_list_generator_creator(),
+                                             places[0])
+
+        if minimize:
+            exe.run(startup_prog)
+
+        return compiled, loader, sgd, loss, image, label
+
+    def _generate(self):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        exe = fluid.Executor(places[0])
+
+        return exe, main_prog, startup_prog
+
+    def _reset_generator(self):
+        unique_name.generator = fluid.unique_name.UniqueNameGenerator()
+        acp.generator = fluid.unique_name.UniqueNameGenerator()
+        acp.g_acp_type = None
+        acp.g_checker = acp.AutoCheckpointChecker()
+        acp.g_program_attr = {}
+
+    def _clear_envs(self):
+        os.environ.pop("PADDLE_RUNNING_ENV", None)
+
+    def _readd_envs(self):
+        os.environ["PADDLE_RUNNING_ENV"] = "PADDLE_EDL_AUTO_CHECKPOINT"
diff --git a/python/paddle/fluid/tests/unittests/c_comm_init_op.py b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db77477cca62d10ff6692013a64a8d2ce5a38ec1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import os
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
+
+
+class TestCCommInitOp(unittest.TestCase):
+    def setUp(self):
+        self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')
+        self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
+        self.nranks = len(self.endpoints)
+        self.rank = self.endpoints.index(self.current_endpoint)
+        self.gpu_id = int(os.getenv("FLAGS_selected_gpus"))
+        self.place = fluid.CUDAPlace(self.gpu_id)
+        self.exe = fluid.Executor(self.place)
+        self.endpoints.remove(self.current_endpoint)
+        self.other_endpoints = self.endpoints
+        if self.rank == 0:
+            wait_server_ready(self.other_endpoints)
+
+    def test_specifying_devices(self):
+        program = fluid.Program()
+        block = program.global_block()
+        nccl_id_var = block.create_var(
+            name=fluid.unique_name.generate('nccl_id'),
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        block.append_op(
+            type='c_gen_nccl_id',
+            inputs={},
+            outputs={'Out': nccl_id_var},
+            attrs={
+                'rank': self.rank,
+                'endpoint': self.current_endpoint,
+                'other_endpoints': self.other_endpoints
+            })
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': nccl_id_var},
+            outputs={},
+            attrs={
+                'nranks': self.nranks,
+                'rank': self.rank,
+                'ring_id': 0,
+                'device_id': self.gpu_id
+            })
+        self.exe.run(program)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdf4ca07ae9b57e083137945f58aaabb571e20ec
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tensor_list = []
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.all_gather(tensor_list, tindata)
+            return tensor_list
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllgatherAPI, "allgather")
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..aea429ae5e3e622ee1b584796ef87edc1d4c8d72
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.all_reduce(tindata)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveAllreduceAPI, "allreduce")
diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b3c27126d926ac7175f6045f385adf4d530b44
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            paddle.distributed.barrier()
+            return []
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveBarrierAPI, "barrier")
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a879a027b50688234c8efb8468e6eac660d8a145
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.broadcast(tindata, src=1)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveBroadcastAPI, "broadcast")
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e89b1cb3ee8550d3dbb4e1a055f092e57126c7f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            paddle.distributed.reduce(tindata, dst=0)
+            return [tindata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveReduceAPI, "reduce")
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..da61284344b58d44c5ba02af5ed42c553f857c94
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+
+class TestCollectiveReduce(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        rootid = 1
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofreduce",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_reduce_sum",
+                inputs={'X': tindata},
+                attrs={'ring_id': ring_id,
+                       'root_id': rootid},
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveReduce, "reduce", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e6904286234364e7ae84a5c21b9826885f99dc4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+
+class TestCollectiveReduce(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        rootid = 1
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofreduce",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_reduce_sum",
+                inputs={'X': tindata},
+                attrs={
+                    'ring_id': ring_id,
+                    'use_calc_stream': True,
+                    'root_id': rootid
+                },
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveReduce, "reduce", 0)
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..f68929ad3b36d5a0bf145a93b30172f0422dc9f9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
+
+
+class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program, rank):
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata",
+                shape=[10, 1000],
+                dtype='float64',
+                append_batch_size=False)
+            toutdata = layers.fill_constant(
+                shape=[5, 1000], dtype='float64', value=1.0)
+            tensor_list = None
+            if rank == 1:
+                tensor_list = paddle.split(tindata, 2, axis=0)
+            paddle.distributed.scatter(toutdata, tensor_list, src=1)
+            return [toutdata]
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveScatterAPI, "scatter")
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..efe5e17bcce1ecddf859edbb3543876fe5fc9f89
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+
+class TestCollectiveScatter(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = 0
+        rootid = 1
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float32')
+            toutdata = main_prog.current_block().create_var(
+                name="outofreduce",
+                dtype='float32',
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=False)
+            main_prog.global_block().append_op(
+                type="c_scatter",
+                inputs={'X': tindata},
+                attrs={'ring_id': ring_id,
+                       'root': rootid,
+                       'nranks': 2},
+                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(
+                type="c_sync_comm_stream",
+                inputs={'X': toutdata},
+                outputs={'Out': toutdata},
+                attrs={'ring_id': ring_id})
+            return toutdata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveScatter, "scatter", 0)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index 6bf95b9d6715bfade20069eec130a676d7edeb55..73b546b95cfeb8032c6e99eabe24c883d1f5f66c 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -28,6 +28,7 @@ import numpy as np
 
 import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
+from paddle.distributed.fleet.base.util_factory import fleet_util
 
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
@@ -161,28 +162,28 @@ class TestDistCTR2x2(FleetDistRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
         fleet.init_worker()
-        exe.run(fleet.startup_program)
-
+        exe.run(fluid.default_startup_program())
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
 
-        compiled_prog = fluid.compiler.CompiledProgram(
-            fleet.main_program).with_data_parallel(
-                loss_name=self.avg_cost.name,
-                build_strategy=self.strategy.get_build_strategy(),
-                exec_strategy=self.strategy.get_execute_strategy())
-
         for epoch_id in range(1):
             self.reader.start()
             try:
                 pass_start = time.time()
                 while True:
-                    loss_val = exe.run(program=compiled_prog,
+                    loss_val = exe.run(program=fluid.default_main_program(),
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
-                    print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
-                                                                  loss_val))
+                    # TODO(randomly fail)
+                    #   reduce_output = fleet_util.all_reduce(
+                    #       np.array(loss_val), mode="sum")
+                    #   loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                    #   loss_val = float(reduce_output) / len(loss_all_trainer)
+                    message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
+                                                                      loss_val)
+                    fleet_util.print_on_rank(message, 0)
+
                 pass_time = time.time() - pass_start
             except fluid.core.EOFException:
                 self.reader.reset()
@@ -201,7 +202,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         exe = fluid.Executor(fluid.CPUPlace())
 
         fleet.init_worker()
-        exe.run(fleet.startup_program)
+        exe.run(fluid.default_startup_program())
 
         thread_num = 2
         batch_size = 128
@@ -210,7 +211,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             filelist.append(train_file_path)
 
         # config dataset
-        dataset = fluid.DatasetFactory().create_dataset()
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
         dataset.set_batch_size(batch_size)
         dataset.set_use_var(self.feeds)
         pipe_command = 'python ctr_dataset_reader.py'
@@ -223,7 +224,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             pass_start = time.time()
             dataset.set_filelist(filelist)
             exe.train_from_dataset(
-                program=fleet.main_program,
+                program=fluid.default_main_program(),
                 dataset=dataset,
                 fetch_list=[self.avg_cost],
                 fetch_info=["cost"],
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d0fa447daf3e3a502e7d77491045f92695496c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -0,0 +1,152 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Distribute CTR model for test fleet api
+"""
+
+from __future__ import print_function
+
+import shutil
+import tempfile
+import time
+
+import paddle
+import paddle.fluid as fluid
+import os
+import numpy as np
+
+import ctr_dataset_reader
+from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
+from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
+from paddle.distributed.fleet.base.util_factory import fleet_util
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistGpuPsCTR2x2(TestDistCTR2x2):
+    """
+    For test CTR model, using Fleet api & PS-GPU
+    """
+
+    def check_model_right(self, dirname):
+        model_filename = os.path.join(dirname, "__model__")
+
+        with open(model_filename, "rb") as f:
+            program_desc_str = f.read()
+
+        program = fluid.Program.parse_from_string(program_desc_str)
+        with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
+            wn.write(str(program))
+
+    def do_pyreader_training(self, fleet):
+        """
+        do training using dataset, using fetch handler to catch variable
+        Args:
+            fleet(Fleet api): the fleet object of Parameter Server, define distribute training role
+        """
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = fluid.CUDAPlace(device_id)
+        exe = fluid.Executor(place)
+        fleet.init_worker()
+        exe.run(fleet.startup_program)
+
+        batch_size = 4
+        train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
+        self.reader.decorate_sample_list_generator(train_reader)
+
+        for epoch_id in range(1):
+            self.reader.start()
+            try:
+                pass_start = time.time()
+                while True:
+                    loss_val = exe.run(program=fleet.main_program,
+                                       fetch_list=[self.avg_cost.name])
+                    loss_val = np.mean(loss_val)
+                    reduce_output = fleet_util.all_reduce(
+                        np.array(loss_val), mode="sum")
+                    loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                    loss_val = float(reduce_output) / len(loss_all_trainer)
+                    message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
+                                                                      loss_val)
+                    fleet_util.print_on_rank(message, 0)
+
+                pass_time = time.time() - pass_start
+            except fluid.core.EOFException:
+                self.reader.reset()
+
+        model_dir = tempfile.mkdtemp()
+        fleet.save_inference_model(
+            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
+        self.check_model_right(model_dir)
+        if fleet.is_first_worker():
+            fleet.save_persistables(executor=exe, dirname=model_dir)
+        shutil.rmtree(model_dir)
+        fleet.stop_worker()
+
+    def do_dataset_training(self, fleet):
+        dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
+        )
+
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = fluid.CUDAPlace(device_id)
+        exe = fluid.Executor(place)
+
+        fleet.init_worker()
+        exe.run(fleet.startup_program)
+
+        thread_num = 2
+        batch_size = 128
+        filelist = []
+        for _ in range(thread_num):
+            filelist.append(train_file_path)
+
+        # config dataset
+        dataset = paddle.fleet.DatasetFactory().create_dataset()
+        dataset.set_batch_size(batch_size)
+        dataset.set_use_var(self.feeds)
+        pipe_command = 'python ctr_dataset_reader.py'
+        dataset.set_pipe_command(pipe_command)
+
+        dataset.set_filelist(filelist)
+        dataset.set_thread(thread_num)
+
+        for epoch_id in range(1):
+            pass_start = time.time()
+            dataset.set_filelist(filelist)
+            exe.train_from_dataset(
+                program=fleet.main_program,
+                dataset=dataset,
+                fetch_list=[self.avg_cost],
+                fetch_info=["cost"],
+                print_period=2,
+                debug=int(os.getenv("Debug", "0")))
+            pass_time = time.time() - pass_start
+
+        if os.getenv("SAVE_MODEL") == "1":
+            model_dir = tempfile.mkdtemp()
+            fleet.save_inference_model(exe, model_dir,
+                                       [feed.name for feed in self.feeds],
+                                       self.avg_cost)
+            self.check_model_right(model_dir)
+            if fleet.is_first_worker():
+                fleet.save_persistables(executor=exe, dirname=model_dir)
+            shutil.rmtree(model_dir)
+
+        fleet.stop_worker()
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistGpuPsCTR2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e811408291a0a3f784ff2b744ce616d6bfbe767
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import time
+import numpy as np
+import logging
+import paddle
+import paddle.fluid as fluid
+#import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
+#role = role_maker.GeneralRoleMaker(
+#init_timeout_seconds=100,
+#run_timeout_seconds=100,
+#http_ip_port="127.0.0.1:26001")
+
+#role = role_maker.PaddleCloudRoleMaker(http_ip_port="127.0.0.1:26001")
+
+#role = role_maker.GeneralRoleMaker(path="./tmp4")
+logger.info("Begin")
+res = [0, 0]
+
+logger.info(res)
+
+role = role_maker.PaddleCloudRoleMaker(path="./tmp4")
+
+fleet.init(role)
+print("init wancheng")  #
+#if fleet.is_worker():
+#    import time
+#    time.sleep(3)
+
+a = [5]
+b = [2]
+res = [0]
+if fleet.worker_index() == 0:
+    role._all_reduce(role._node_type_comm, a)
+elif fleet.worker_index() == 1:
+    role._all_reduce(role._node_type_comm, b)
+
+#logger.info(res)
+#print("res ", res)
+
+#role._barrier_all()
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
index c69e1247a9bb8f97350ae79bcc6df1bc645204ea..77697896b4d556da8a98c17e281b3d7a6999fd64 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
@@ -152,24 +152,18 @@ class TestDistCTR2x2(FleetDistRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
         fleet.init_worker()
-        exe.run(fleet.startup_program)
+        exe.run(fluid.default_startup_program())
 
         batch_size = 4
 
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
 
-        compiled_prog = fluid.compiler.CompiledProgram(
-            fleet.main_program).with_data_parallel(
-                loss_name=self.avg_cost.name,
-                build_strategy=self.strategy.get_build_strategy(),
-                exec_strategy=self.strategy.get_execute_strategy())
-
         for epoch_id in range(1):
             self.reader.start()
             try:
                 while True:
-                    loss_val = exe.run(program=compiled_prog,
+                    loss_val = exe.run(program=fluid.default_main_program(),
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
                     print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..75bff108dd43665df0fc1c8b166a935946b4fbc7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import create_paddle_predictor
+
+
+class PredictorTools(object):
+    '''
+    Paddle-Inference predictor
+    '''
+
+    def __init__(self, model_path, params_file, feeds_var):
+        '''
+        __init__
+        '''
+        self.model_path = model_path
+        self.params_file = params_file
+
+        self.feeds_var = feeds_var
+
+    def _load_model_and_set_config(self):
+        '''
+        load model from file and set analysis config 
+        '''
+        if os.path.exists(os.path.join(self.model_path, self.params_file)):
+            config = AnalysisConfig(
+                os.path.join(self.model_path, "__model__"),
+                os.path.join(self.model_path, self.params_file))
+        else:
+            config = AnalysisConfig(os.path.join(self.model_path))
+
+        if fluid.is_compiled_with_cuda():
+            config.enable_use_gpu(100, 0)
+        else:
+            config.disable_gpu()
+        config.switch_specify_input_names(True)
+        config.switch_use_feed_fetch_ops(False)
+        config.enable_memory_optim()
+        config.disable_glog_info()
+        config.switch_ir_optim(True)
+
+        return config
+
+    def _get_analysis_outputs(self, config):
+        '''
+        Return outputs of paddle inference
+        Args:
+            config (AnalysisConfig): predictor configs
+        Returns:
+            outs (numpy array): forward netwrok prediction outputs
+        '''
+        predictor = create_paddle_predictor(config)
+        tensor_shapes = predictor.get_input_tensor_shape()
+        names = predictor.get_input_names()
+        for i, name in enumerate(names):
+            #assert name in self.feeds_var, '{} not in feeded dict'.format(name)
+            shape = tensor_shapes[name]
+            tensor = predictor.get_input_tensor(name)
+            feed_data = self.feeds_var[i]
+            tensor.copy_from_cpu(np.array(feed_data))
+            if type(feed_data) == fluid.LoDTensor:
+                tensor.set_lod(feed_data.lod())
+
+        # ensure no diff in multiple repeat times
+        repeat_time = 10
+        for i in range(repeat_time):
+            predictor.zero_copy_run()
+
+        output_names = predictor.get_output_names()
+        outs = [
+            predictor.get_output_tensor(out_name).copy_to_cpu()
+            for out_name in output_names
+        ]
+
+        return outs
+
+    def __call__(self):
+        '''
+        __call__
+        '''
+        config = self._load_model_and_set_config()
+        outputs = self._get_analysis_outputs(config)
+
+        return outputs
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
index 68e6f328726f5b2664d31ac46394fa451631388c..d4646833ea2bd4e3c2c07a1962e4e866bdfe776e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
@@ -17,12 +17,13 @@ from __future__ import print_function
 import numpy
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.jit import declarative
 
 
-@declarative
+@paddle.jit.to_static
 def dyfunc_assert_variable(x):
     x_v = fluid.dygraph.to_variable(x)
     assert x_v
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
index 27777a62799e104ac8a08fd67df8bdbe2a256724..f105dd5e94744ecca96ee0282432ff4946ab5e04 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
@@ -23,6 +23,8 @@ from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 from bert_dygraph_model import PretrainModelLayer
 from bert_utils import get_bert_config, get_feed_data_reader
 
+from predictor_utils import PredictorTools
+
 program_translator = ProgramTranslator()
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
 )
@@ -152,6 +154,12 @@ def predict_dygraph_jit(data):
         return pred_res
 
 
+def predict_analysis_inference(data):
+    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, data)
+    out = output()
+    return out
+
+
 class TestBert(unittest.TestCase):
     def setUp(self):
         self.bert_config = get_bert_config()
@@ -178,9 +186,11 @@ class TestBert(unittest.TestCase):
             dygraph_pred_res = predict_dygraph(self.bert_config, data)
             static_pred_res = predict_static(data)
             dygraph_jit_pred_res = predict_dygraph_jit(data)
+            predictor_pred_res = predict_analysis_inference(data)
 
-            for dy_res, st_res, dy_jit_res in zip(
-                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res):
+            for dy_res, st_res, dy_jit_res, predictor_res in zip(
+                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res,
+                    predictor_pred_res):
                 self.assertTrue(
                     np.allclose(st_res, dy_res),
                     "dygraph_res: {},\n static_res: {}".format(
@@ -191,6 +201,11 @@ class TestBert(unittest.TestCase):
                     "dygraph_jit_res: {},\n static_res: {}".format(
                         dy_jit_res[~np.isclose(st_res, dy_jit_res)],
                         st_res[~np.isclose(st_res, dy_jit_res)]))
+                self.assertTrue(
+                    np.allclose(st_res, predictor_res),
+                    "dygraph_jit_res: {},\n static_res: {}".format(
+                        predictor_res[~np.isclose(st_res, predictor_res)],
+                        st_res[~np.isclose(st_res, predictor_res)]))
             break
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index c01705dbe9ba655d9cfb538dfdde0474ffa30855..dd58a49bb55c24a5e126965bff415d9a54cff5ad 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -15,13 +15,15 @@
 import math
 import numpy as np
 import unittest
-
+from paddle.jit import to_static
 import paddle.fluid as fluid
 from paddle.fluid import ParamAttr
 from paddle.fluid.dygraph import to_variable
-from paddle.fluid.dygraph import declarative, ProgramTranslator
+from paddle.fluid.dygraph import ProgramTranslator
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 DATATYPE = 'float32'
 program_translator = ProgramTranslator()
@@ -240,7 +242,7 @@ class BMN(fluid.dygraph.Layer):
             param_attr=ParamAttr(name="PEM_2d4_w"),
             bias_attr=ParamAttr(name="PEM_2d4_b"))
 
-    @declarative
+    @to_static
     def forward(self, x):
         # Base Module
         x = self.b_conv1(x)
@@ -693,9 +695,11 @@ class TestTrain(unittest.TestCase):
             static_pred_res = self.predict_static(video_data)
             dygraph_pred_res = self.predict_dygraph(video_data)
             dygraph_jit_pred_res = self.predict_dygraph_jit(video_data)
+            predictor_pred_res = self.predict_analysis_inference(video_data)
 
-            for dy_res, st_res, dy_jit_res in zip(
-                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res):
+            for dy_res, st_res, dy_jit_res, predictor_res in zip(
+                    dygraph_pred_res, static_pred_res, dygraph_jit_pred_res,
+                    predictor_pred_res):
                 self.assertTrue(
                     np.allclose(st_res, dy_res),
                     "dygraph_res: {},\n static_res: {}".format(
@@ -706,6 +710,11 @@ class TestTrain(unittest.TestCase):
                     "dygraph_jit_res: {},\n static_res: {}".format(
                         dy_jit_res[~np.isclose(st_res, dy_jit_res)],
                         st_res[~np.isclose(st_res, dy_jit_res)]))
+                self.assertTrue(
+                    np.allclose(st_res, predictor_res),
+                    "dygraph_jit_res: {},\n static_res: {}".format(
+                        predictor_res[~np.isclose(st_res, predictor_res)],
+                        st_res[~np.isclose(st_res, predictor_res)]))
             break
 
     def predict_dygraph(self, data):
@@ -749,6 +758,11 @@ class TestTrain(unittest.TestCase):
 
             return pred_res
 
+    def predict_analysis_inference(self, data):
+        output = PredictorTools(self.args.infer_dir, VARIABLE_FILENAME, [data])
+        out = output()
+        return out
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
index c8051b3f24170693259b17456bcc124221117900..af1e44ffe212343c02a9bed8a8cacb0a966451aa 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
@@ -19,7 +19,7 @@ import numpy as np
 import unittest
 
 import paddle.fluid as fluid
-from paddle.fluid.dygraph.jit import declarative
+from paddle.jit import to_static
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
 
 PLACE = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
@@ -76,7 +76,7 @@ class MainNetWithDict(fluid.dygraph.Layer):
         self.output_size = output_size
         self.sub_net = SubNetWithDict(hidden_size, output_size)
 
-    @declarative
+    @to_static
     def forward(self, input, max_len=4):
         input = fluid.dygraph.to_variable(input)
         cache = {
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
new file mode 100644
index 0000000000000000000000000000000000000000..586020d434519b12c6fff4cbba812a013cf45c3d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
@@ -0,0 +1,147 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import inspect
+import unittest
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.core import EnforceNotMet
+from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap
+from paddle.fluid.dygraph.jit import declarative
+
+
+def inner_func():
+    fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")
+    return
+
+
+@declarative
+def func_error_in_compile_time(x):
+    x = fluid.dygraph.to_variable(x)
+    inner_func()
+    if fluid.layers.mean(x) < 0:
+        x_v = x - 1
+    else:
+        x_v = x + 1
+    return x_v
+
+
+@declarative
+def func_error_in_compile_time_2(x):
+    x = fluid.dygraph.to_variable(x)
+    x = fluid.layers.reshape(x, shape=[1, 2])
+    return x
+
+
+@declarative
+def func_error_in_runtime(x, iter_num=3):
+    x = fluid.dygraph.to_variable(x)
+    two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")
+    x = fluid.layers.reshape(x, shape=[1, two])
+    return x
+
+
+class TestErrorInCompileTime(unittest.TestCase):
+    def setUp(self):
+        self.set_func()
+        self.set_input()
+        self.set_exception_type()
+
+    def set_func(self):
+        self.func = func_error_in_compile_time
+
+    def set_exception_type(self):
+        self.exception_type = TypeError
+
+    def set_input(self):
+        self.input = np.ones([3, 2])
+
+    def set_message(self):
+        self.expected_message = \
+            ['File "{}", line 36, in func_error_in_compile_time'.format(self.filepath),
+            'inner_func()',
+            'File "{}", line 29, in inner_func'.format(self.filepath),
+            'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
+            ]
+
+    def _test_create_message(self, error_data):
+        self.filepath = inspect.getfile(unwrap(self.func))
+        self.set_message()
+        error_message = error_data.create_message()
+
+        self.assertIn('In user code:', error_message)
+        for m in self.expected_message:
+            self.assertIn(m, error_message)
+
+    def test(self):
+        with fluid.dygraph.guard():
+            with self.assertRaises(self.exception_type) as cm:
+                self.func(self.input)
+            exception = cm.exception
+            error_data = getattr(exception, ERROR_DATA)
+            self.assertIsInstance(error_data, ErrorData)
+            self._test_create_message(error_data)
+
+
+class TestErrorInCompileTime2(TestErrorInCompileTime):
+    def set_func(self):
+        self.func = func_error_in_compile_time_2
+
+    def set_exception_type(self):
+        self.exception_type = EnforceNotMet
+
+    def set_message(self):
+
+        self.expected_message = \
+            [
+             'File "{}", line 47, in func_error_in_compile_time_2'.format(self.filepath),
+             'x = fluid.layers.reshape(x, shape=[1, 2])'
+             ]
+
+
+class TestErrorInRuntime(TestErrorInCompileTime):
+    def set_func(self):
+        self.func = func_error_in_runtime
+
+    def set_exception_type(self):
+        self.exception_type = EnforceNotMet
+
+    def set_message(self):
+        self.expected_message = \
+            [
+                'File "{}", line 55, in func_error_in_runtime'.format(self.filepath),
+                'x = fluid.layers.reshape(x, shape=[1, two])'
+            ]
+
+    def _test_create_message(self, error_data):
+        self.filepath = inspect.getfile(unwrap(self.func))
+        self.set_message()
+
+        with self.assertRaises(ValueError):
+            error_data.create_message()
+
+        error_data.in_runtime = False
+        error_message = error_data.create_message()
+
+        self.assertIn('In user code:', error_message)
+        for m in self.expected_message:
+            self.assertIn(m, error_message)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index fdf6daf6263e2bb7cf8ef2c3ad1373fb079f0037..0e2bac9fa5b5c9e47ce8a08b0187531a3b83dcee 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -27,6 +27,8 @@ from paddle.fluid.dygraph import Embedding, Linear, GRUUnit
 from paddle.fluid.dygraph import declarative, ProgramTranslator
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 
 program_translator = ProgramTranslator()
@@ -536,6 +538,7 @@ class TestLACModel(unittest.TestCase):
             dy_pre = self.predict_dygraph(batch)
             st_pre = self.predict_static(batch)
             dy_jit_pre = self.predict_dygraph_jit(batch)
+            predictor_pre = self.predict_analysis_inference(batch)
             self.assertTrue(
                 np.allclose(dy_pre, st_pre),
                 msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
@@ -543,6 +546,10 @@ class TestLACModel(unittest.TestCase):
                 np.allclose(dy_jit_pre, st_pre),
                 msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre,
                                                                st_pre))
+            self.assertTrue(
+                np.allclose(predictor_pre, st_pre),
+                msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
+                                                                  st_pre))
 
     def predict_dygraph(self, batch):
         words, targets, length = batch
@@ -591,6 +598,14 @@ class TestLACModel(unittest.TestCase):
 
             return pred_res.numpy()
 
+    def predict_analysis_inference(self, batch):
+        words, targets, length = batch
+
+        output = PredictorTools(self.args.model_save_dir, VARIABLE_FILENAME,
+                                [words, length])
+        out = output()
+        return out
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index b8aa0379638fadd19b4956a56c1a3e4811558535..1ef3bd1bf150056816283c83fa3ff6af1e589732 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -25,10 +25,11 @@ from paddle.fluid.dygraph.base import switch_to_static_graph
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.dygraph.jit import declarative
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 
 
@@ -100,7 +101,7 @@ class MNIST(fluid.dygraph.Layer):
                     loc=0.0, scale=scale)),
             act="softmax")
 
-    @declarative
+    @paddle.jit.to_static
     def forward(self, inputs, label=None):
         x = self.inference(inputs)
         if label is not None:
@@ -132,7 +133,7 @@ class TestMNIST(unittest.TestCase):
             drop_last=True)
 
 
-class TestMNISTWithDeclarative(TestMNIST):
+class TestMNISTWithToStatic(TestMNIST):
     """
     Tests model if doesn't change the layers while decorated
     by `dygraph_to_static_output`. In this case, everything should
@@ -145,7 +146,7 @@ class TestMNISTWithDeclarative(TestMNIST):
     def train_dygraph(self):
         return self.train(to_static=False)
 
-    def test_mnist_declarative(self):
+    def test_mnist_to_static(self):
         dygraph_loss = self.train_dygraph()
         static_loss = self.train_static()
         self.assertTrue(
@@ -220,6 +221,10 @@ class TestMNISTWithDeclarative(TestMNIST):
             dygraph_infer_out = self.jit_load_and_run_inference_dygraph(
                 infer_model_path, inputs)
             self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out))
+            # load in Paddle-Inference
+            predictor_infer_out = self.predictor_load_and_run_inference_analysis(
+                infer_model_path, inputs)
+            self.assertTrue(np.allclose(gt_out.numpy(), predictor_infer_out))
 
     @switch_to_static_graph
     def jit_load_and_run_inference_static(self, model_path, inputs):
@@ -241,6 +246,11 @@ class TestMNISTWithDeclarative(TestMNIST):
         pred = infer_net(inputs[0])
         return pred.numpy()
 
+    def predictor_load_and_run_inference_analysis(self, model_path, inputs):
+        output = PredictorTools(model_path, VARIABLE_FILENAME, inputs)
+        out = output()
+        return out
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index ef0f6e7f0831eea8d2f694413c5231ecea292ff4..5ec3de5871dd6787c06938a8b771f7d14e54e1e0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -23,6 +23,8 @@ from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
 import unittest
 
+from predictor_utils import PredictorTools
+
 # Note: Set True to eliminate randomness.
 #     1. For one operation, cuDNN has several algorithms,
 #        some algorithm results are non-deterministic, like convolution algorithms.
@@ -550,6 +552,12 @@ def predict_dygraph_jit(args, data):
         return pred_res.numpy()
 
 
+def predict_analysis_inference(args, data):
+    output = PredictorTools(args.model_save_path, VARIABLE_FILENAME, [data])
+    out = output()
+    return out
+
+
 class TestMobileNet(unittest.TestCase):
     def setUp(self):
         self.args = Args()
@@ -577,12 +585,18 @@ class TestMobileNet(unittest.TestCase):
         dy_pre = predict_dygraph(self.args, image)
         st_pre = predict_static(self.args, image)
         dy_jit_pre = predict_dygraph_jit(self.args, image)
+        predictor_pre = predict_analysis_inference(self.args, image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
         self.assertTrue(
             np.allclose(dy_jit_pre, st_pre),
             msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(
+                predictor_pre, st_pre, atol=1e-5),
+            msg="inference_pred_res:\n {}\n, st_pre: \n{}.".format(
+                predictor_pre, st_pre))
 
     def test_mobile_net(self):
         # MobileNet-V1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
index 631655ec74428344376ea5b814ea443a91c49fc0..b03777b6ebc7f3cceb73cd32e6fdfea11755320e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
@@ -90,7 +90,8 @@ class TestOriginInfo(unittest.TestCase):
 
         # step3
         self.static_func, _ = ast_to_func(transformed_ast, self.dygraph_func)
-        info_map = create_origin_info_map(dygraph_ast, self.static_func)
+        info_map = create_and_update_origin_info_map(dygraph_ast,
+                                                     self.static_func)
 
         return info_map
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
index 90d210eba1e0fb1eeaf5eb0c8cbc0ff46c35328f..46eb2b42e9265ac7f6340ee0be3a7127e5246eef 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
@@ -26,12 +26,15 @@ from paddle.fluid.dygraph import declarative, ProgramTranslator
 from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 IMAGENET1000 = 1281167
 base_lr = 0.001
 momentum_rate = 0.9
 l2_decay = 1e-4
-batch_size = 8
+# NOTE: Reduce batch_size from 8 to 2 to avoid unittest timeout.
+batch_size = 2
 epoch_num = 1
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
     else fluid.CPUPlace()
@@ -306,6 +309,12 @@ def predict_dygraph_jit(data):
         return pred_res.numpy()
 
 
+def predict_analysis_inference(data):
+    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data])
+    out = output()
+    return out
+
+
 class TestResnet(unittest.TestCase):
     def train(self, to_static):
         program_translator.enable(to_static)
@@ -316,12 +325,17 @@ class TestResnet(unittest.TestCase):
         dy_pre = predict_dygraph(image)
         st_pre = predict_static(image)
         dy_jit_pre = predict_dygraph_jit(image)
+        predictor_pre = predict_analysis_inference(image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
         self.assertTrue(
             np.allclose(dy_jit_pre, st_pre),
             msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(predictor_pre, st_pre),
+            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
+                                                              st_pre))
 
     def test_resnet(self):
         static_loss = self.train(to_static=True)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
index c34e9478c8eab38c429c01db5fae460eeac6a4bd..30cba78fec19c169966e85ff43e79c3a00889616 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
@@ -26,6 +26,8 @@ from paddle.fluid.dygraph import declarative
 from paddle.fluid.dygraph import ProgramTranslator
 from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 
+from predictor_utils import PredictorTools
+
 SEED = 2020
 np.random.seed(SEED)
 
@@ -434,6 +436,12 @@ def predict_dygraph_jit(data):
         return pred_res.numpy()
 
 
+def predict_analysis_inference(data):
+    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data])
+    out = output()
+    return out
+
+
 class TestSeResnet(unittest.TestCase):
     def setUp(self):
         self.train_reader = paddle.batch(
@@ -447,12 +455,17 @@ class TestSeResnet(unittest.TestCase):
         dy_pre = predict_dygraph(image)
         st_pre = predict_static(image)
         dy_jit_pre = predict_dygraph_jit(image)
+        predictor_pre = predict_analysis_inference(image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
         self.assertTrue(
             np.allclose(dy_jit_pre, st_pre),
             msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(predictor_pre, st_pre),
+            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
+                                                              st_pre))
 
     def test_check_result(self):
         pred_1, loss_1, acc1_1, acc5_1 = train(
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
similarity index 55%
rename from python/paddle/fluid/tests/unittests/test_hdfs.py
rename to python/paddle/fluid/tests/unittests/hdfs_test_utils.py
index 9826542cee3732a48e1c6b6959afb74063bb09d7..6a752bc3053d7d0672bd0002250252c3bbbfa1e1 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -15,18 +15,16 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
-from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
 
-class FSTest(unittest.TestCase):
+class FSTestBase(unittest.TestCase):
     def _test_dirs(self, fs):
         dir_path = os.path.abspath("./test_dir")
         fs.delete(dir_path)
@@ -59,6 +57,12 @@ class FSTest(unittest.TestCase):
         fs.delete(dir_path)
         self.assertTrue(not fs.is_exist(dir_path))
 
+        fs.mkdirs(dir_path)
+        fs.mkdirs(new_dir_path)
+        fs.mv(dir_path, new_dir_path, overwrite=True)
+        self.assertTrue(not fs.is_exist(dir_path))
+        self.assertTrue(fs.is_exist(new_dir_path))
+
     def _test_touch_file(self, fs):
         file_path = os.path.abspath("./test_file")
 
@@ -106,6 +110,35 @@ class FSTest(unittest.TestCase):
         fs.delete(dst_file)
         fs.delete(src_file)
 
+    def _test_try_download(self, fs):
+        src_file = os.path.abspath("./test_try_download.src")
+        dst_file = os.path.abspath("./test_try_download.dst")
+
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+        try:
+            fs._try_download(src_file, dst_file)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+    def _test_try_upload(self, fs):
+        src_file = os.path.abspath("./test_try_upload.src")
+        dst_file = os.path.abspath("./test_try_uolpad.dst")
+
+        try:
+            fs._try_upload(src_file, dst_file)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
     def _test_download(self, fs):
         src_file = os.path.abspath("./test_download.src")
         dst_file = os.path.abspath("./test_download.dst")
@@ -140,85 +173,46 @@ class FSTest(unittest.TestCase):
         fs.mkdirs(dir_name)
         fs.mkdirs(dir_name)
 
-    def test_exists(self):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
-        self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
-        self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
-        self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
-        dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs.py"))
-        self.assertTrue(dirs == [])
-        self.assertTrue(len(files) == 1)
-        dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
-
-    def test_hdfs(self):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
-        self._test_dirs(fs)
-        self._test_upload(fs)
-
-        self._test_download(fs)
-        self._test_mkdirs(fs)
-        self._test_list_dir(fs)
-
-    def test_local(self):
-        fs = LocalFS()
-        self._test_dirs(fs)
-        self._test_touch_file(fs)
-        self._test_mkdirs(fs)
-        self._test_list_dir(fs)
-
-    def test_timeout(self):
+    def _test_rm(self, fs):
+        dir_name = "./test_rm_no_exist.flag"
+        fs.delete(dir_name)
+        try:
+            fs._rmr(dir_name)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+        try:
+            fs._rm(dir_name)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+    def _test_list_dir(self, fs):
         fs = HDFSClient(
             "/usr/local/hadoop-2.7.7/",
             None,
-            time_out=6 * 1000,
-            sleep_inter=2000)
-        src = "hdfs_test_timeout"
-        dst = "new_hdfs_test_timeout"
-        fs.delete(dst)
-        fs.mkdirs(src)
-        fs.mkdirs(dst)
-        fs.mkdirs(dst + "/" + src)
-        output = ""
+            time_out=15 * 1000,
+            sleep_inter=100)
+        fs.ls_dir("test_not_exists")
+
+    def _test_touch(self, fs):
+        path = "./touch.flag"
+        fs.touch(path, exist_ok=True)
+        try:
+            fs.touch("./touch.flag", exist_ok=False)
+            self.assertFalse(0, "can't reach here")
+        except FSFileExistsError as e:
+            pass
+
         try:
-            fs.mv(src, dst, test_exists=False)
-            self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd,
-                                                                        output))
-        except FSTimeOut as e:
-            print("execute mv {} to {} timeout".format(src, dst))
-
-        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
-        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
-        self.assertNotEqual(ret, 0)
-        print("second mv ret:{} output:{}".format(ret, output))
-
-    def test_is_dir(self):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
-        self.assertFalse(fs.is_dir("./test_hdfs.py"))
-        s = """
-java.io.IOException: Input/output error
- responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
-	at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
-	at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
-	at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
-	at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
-	at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
-	at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
-	at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
-	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
-	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
-	at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
-        """
-
-        print("split lines:", s.splitlines())
-        self.assertTrue(fs._test_match(s.splitlines()) != None)
-
-    def test_config(self):
-        config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", config, time_out=15 * 1000)
+            fs._touchz("./touch.flag")
+            self.assertFalse(True, "can't reach here")
+        except Exception as e:
+            pass
 
-    def _test_list_dir(self, fs):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
-        fs.ls_dir("test_not_exists")
+        self.assertFalse(fs.is_dir(path))
+        fs.delete(path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
index 7edca281fff9df02436b2cc1af5409db0ea1981d..46d574dad0d0ae1f72617c6aaf3369b16195f76b 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
@@ -77,12 +77,13 @@ class FusionGroupPassTest(PassTest):
             self.check_output_with_place(fluid.CUDAPlace(0))
 
 
-class FusionGroupPassTest1(FusionGroupPassTest):
+class FusionGroupPassComplicatedTest(FusionGroupPassTest):
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
-            self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 5)
+            self.feed_vars = self._prepare_feed_vars([32, 64], dtype, 5)
 
-            tmp_0 = layers.assign(self.feed_vars[0])
+            one = layers.fill_constant(shape=[1], dtype=dtype, value=1.0)
+            tmp_0 = one * self.feed_vars[0]
             # subgraph with 9 op nodes
             tmp_1 = tmp_0 * layers.sigmoid(self.feed_vars[1]) + layers.sigmoid(
                 self.feed_vars[2]) * layers.tanh(self.feed_vars[3])
@@ -94,7 +95,7 @@ class FusionGroupPassTest1(FusionGroupPassTest):
         self.fetch_list = [tmp_2, self.grad(tmp_0)]
 
 
-class FusionGroupPassTest2(FusionGroupPassTest):
+class FusionGroupPassInplaceTest(FusionGroupPassTest):
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
@@ -103,15 +104,13 @@ class FusionGroupPassTest2(FusionGroupPassTest):
                     name="data3", shape=[128, 32], dtype=dtype))
 
             # subgraph with 3 op node
-            tmp_0 = self.feed_vars[0] + self.feed_vars[1]
-            tmp_1 = layers.relu(self.feed_vars[2] * tmp_0)
-            # subgraph with 2 op nodes
-            tmp_2 = layers.relu(layers.sigmoid(self.feed_vars[3]))
-            tmp_3 = layers.mul(tmp_1, tmp_2)
+            tmp_0 = self.feed_vars[0] - self.feed_vars[1]
+            tmp_1 = tmp_0 * self.feed_vars[2]
+            tmp_2 = layers.assign(tmp_1, output=tmp_0)
+            tmp_3 = layers.mul(tmp_2, self.feed_vars[3])
 
-        self.append_gradients(tmp_3)
-        self.num_fused_ops = 2
-        self.fetch_list = [tmp_3, self.grad(tmp_1)]
+        self.num_fused_ops = 1
+        self.fetch_list = [tmp_3]
 
 
 class FusionGroupPassTestFP64(FusionGroupPassTest):
diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py
index 64fee35710ae1b8690ec41b247ceb55e180b13c9..ecfe39b80e9051d332bb8fd2a05de2fa53770e46 100644
--- a/python/paddle/fluid/tests/unittests/launch_function_helper.py
+++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 from multiprocessing import Pool, Process
 import os
+import socket
+from contextlib import closing
 
 
 def launch_func(func, env_dict):
@@ -20,3 +22,16 @@ def launch_func(func, env_dict):
         os.environ[key] = env_dict[key]
     proc = Process(target=func)
     return proc
+
+
+def _find_free_port(port_set):
+    def __free_port():
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
+    while True:
+        port = __free_port()
+        if port not in port_set:
+            port_set.add(port)
+            return port
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index 2404aeb72b2a77f1f817cec697b3188003e884eb..d904bdbfa96ae1df83a0cacde0822611ac55757e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -19,7 +19,7 @@ import numpy as np
 from scipy.special import expit
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestSigmoid
+from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestRelu6, TestSigmoid
 from paddle.fluid.tests.unittests.test_gelu_op import gelu
 from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
 
@@ -30,16 +30,17 @@ class TestMKLDNNReluDim2(TestRelu):
 
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
+
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+class TestMKLDNNRelu6Dim2(TestRelu6):
+    def setUp(self):
+        super(TestMKLDNNRelu6Dim2, self).setUp()
+        self.attrs.update({"use_mkldnn": True})
+
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNLeakyReluDim2(TestLeakyRelu):
@@ -48,16 +49,8 @@ class TestMKLDNNLeakyReluDim2(TestLeakyRelu):
 
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNGeluDim2(TestActivation):
@@ -92,16 +85,8 @@ class TestMKLDNNTanhDim2(TestTanh):
 
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNSqrtDim2(TestSqrt):
@@ -110,16 +95,8 @@ class TestMKLDNNSqrtDim2(TestSqrt):
 
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNAbsDim2(TestAbs):
@@ -127,39 +104,21 @@ class TestMKLDNNAbsDim2(TestAbs):
         super(TestMKLDNNAbsDim2, self).setUp()
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNSwishDim2(TestSwish):
     def setUp(self):
         super(TestMKLDNNSwishDim2, self).setUp()
 
-        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        beta = 2.3
-        out = x * expit(beta * x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True, "beta": beta}
+        self.attrs["use_mkldnn"] = True
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output()
+    def init_dtype(self):
+        self.dtype = np.float32
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(['X'], 'Out')
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNSigmoidDim2(TestSigmoid):
@@ -181,16 +140,8 @@ class TestMKLDNNReluDim4(TestRelu):
         self.outputs = {'Out': out}
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNLeakyReluDim4(TestLeakyRelu):
@@ -206,16 +157,8 @@ class TestMKLDNNLeakyReluDim4(TestLeakyRelu):
         self.outputs = {'Out': out}
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNGeluDim4(TestActivation):
@@ -254,17 +197,6 @@ class TestMKLDNNTanhDim4(TestTanh):
         self.outputs = {'Out': np.tanh(self.inputs['X'])}
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
-
 
 class TestMKLDNNSqrtDim4(TestSqrt):
     def setUp(self):
@@ -276,17 +208,6 @@ class TestMKLDNNSqrtDim4(TestSqrt):
         self.outputs = {'Out': np.sqrt(self.inputs['X'])}
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
-
 
 class TestMKLDNNAbsDim4(TestAbs):
     def setUp(self):
@@ -299,23 +220,15 @@ class TestMKLDNNAbsDim4(TestAbs):
         self.outputs = {'Out': np.abs(self.inputs['X'])}
         self.attrs = {"use_mkldnn": True}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNSwishDim4(TestSwish):
     def setUp(self):
         super(TestMKLDNNSwishDim4, self).setUp()
 
-        x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(self.dtype)
         beta = 2.3
         out = x * expit(beta * x)
 
@@ -323,15 +236,8 @@ class TestMKLDNNSwishDim4(TestSwish):
         self.outputs = {'Out': out}
         self.attrs = {"use_mkldnn": True, "beta": beta}
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(['X'], 'Out')
+    def init_dtype(self):
+        self.dtype = np.float32
 
 
 class TestMKLDNNSigmoidDim4(TestSigmoid):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfbbf7de22087d13aed1f8293d362aead5ae03b3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py
@@ -0,0 +1,78 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.test_fusion_gru_op import TestFusionGRUOp
+
+
+class TestFusionGRUMKLDNNOp(TestFusionGRUOp):
+    def set_confs(self):
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpNoInitial(TestFusionGRUOp):
+    def set_confs(self):
+        self.with_h0 = False
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpNoBias(TestFusionGRUOp):
+    def set_confs(self):
+        self.with_bias = False
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpReverse(TestFusionGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpOriginMode(TestFusionGRUOp):
+    def set_confs(self):
+        self.origin_mode = True
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpMD1(TestFusionGRUOp):
+    def set_confs(self):
+        self.M = 36
+        self.D = 8
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpMD2(TestFusionGRUOp):
+    def set_confs(self):
+        self.M = 8
+        self.D = 8
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpMD3(TestFusionGRUOp):
+    def set_confs(self):
+        self.M = 17
+        self.D = 15
+        self.use_mkldnn = True
+
+
+class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
+    def set_confs(self):
+        self.lod = [[3]]
+        self.D = 16
+        self.use_mkldnn = True
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
index bd8842da03e988c374586983d75cc3031c446906..11b453125dfdfb267fb9f0d4d98b93e08959116e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
@@ -105,8 +105,11 @@ class TestDnnlMatMulOpInt8NoScales(TestDnnlMatMulOp):
 
 
 class TestDnnlMatMulOpInt8(TestDnnlMatMulOp):
+    # Due to limitation in int8 matmul implementation
+    # on older platforms (BDW, SKX) we needed to reduce
+    # range from [-127, 127] to [-63, 63]
     def quantize(self, tensor):
-        scale = 127. / np.abs(np.amax(tensor))
+        scale = 63. / np.abs(np.amax(tensor))
         quantized = np.round(scale * tensor).astype("int8")
         return scale, quantized
 
diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
index a67634adfcc0c27d5c9b470c81b880af9130462f..f999ce803a512403da14a4ea2064448aedfe242e 100644
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -17,7 +17,7 @@ import sys
 import time
 
 
-def train():
+def train(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -29,11 +29,12 @@ def train():
         .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
     print(name)
-    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+    with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+              "w") as f:
         f.write(name)
 
 
-def train_abort():
+def train_abort(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -49,8 +50,9 @@ def train_abort():
             name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
                 .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
             print(name)
-            with open("multi_process.check_{}.log".format(trainer_id),
-                      "w") as f:
+            with open(
+                    "multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                    "w") as f:
                 f.write(name)
             raise
     else:
@@ -60,12 +62,15 @@ def train_abort():
             .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
 
         print(name)
-        with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+        with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                  "w") as f:
             f.write(name)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 2 and sys.argv[1] == "abort":
-        train_abort()
+    if len(sys.argv) == 3 and sys.argv[2] == "abort":
+        prefix = sys.argv[1]
+        train_abort(prefix)
     else:
-        train()
+        prefix = sys.argv[1]
+        train(prefix)
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1320623f8f8422f14677a3ca629735838dc94aa8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import contextlib
+import unittest
+import numpy as np
+import six
+import pickle
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.nn import Conv2d, Pool2D, Linear, SyncBatchNorm
+from paddle.fluid.dygraph.base import to_variable
+
+from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
+
+
+class TestLayer(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(TestLayer, self).__init__()
+
+        self._conv = Conv2d(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            bias_attr=False)
+
+        self._sync_batch_norm = SyncBatchNorm(num_filters)
+
+        self._conv2 = Conv2d(
+            in_channels=num_filters,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            bias_attr=False)
+
+        self._sync_batch_norm2 = SyncBatchNorm(
+            num_filters,
+            weight_attr=False,
+            bias_attr=False,
+            track_running_stats=False)
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._sync_batch_norm(y)
+        y = self._conv2(y)
+        y = self._sync_batch_norm2(y)
+
+        return y
+
+
+class TestSyncBatchNorm(TestParallelDyGraphRunnerBase):
+    def get_model(self):
+        model = TestLayer(3, 64, 7)
+        train_reader = paddle.batch(
+            paddle.dataset.flowers.test(use_xmap=False),
+            batch_size=32,
+            drop_last=True)
+        opt = fluid.optimizer.Adam(
+            learning_rate=1e-3, parameter_list=model.parameters())
+        return model, train_reader, opt
+
+    def run_one_loop(self, model, opt, data):
+        batch_size = len(data)
+        dy_x_data = np.array([x[0].reshape(3, 224, 224)
+                              for x in data]).astype('float32')
+        img = to_variable(dy_x_data)
+        img.stop_gradient = False
+
+        out = model(img)
+
+        out = fluid.layers.mean(out)
+
+        return out
+
+
+if __name__ == "__main__":
+    runtime_main(TestSyncBatchNorm)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index ef4779f0e6f2df2f0b79f776d1e7b6c5cbf31a22..ec6b81f138321f2119a5a5aaf4b5ba9ae8f7e69b 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -34,7 +34,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     def check_network_convergence(cls,
                                   method,
                                   use_cuda=True,
-                                  iter=50,
+                                  iter=5,
                                   batch_size=None,
                                   feed_dict=None,
                                   feed_data_reader=None,
diff --git a/python/paddle/fluid/tests/unittests/parallel_test.sh b/python/paddle/fluid/tests/unittests/parallel_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9da4f035345d7f04b69a1c9483cba7022ad10baa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+unset https_proxy http_proxy
+export FLAGS_rpc_disable_reuse_port=1
+
+name=${TEST_TARGET_NAME}
+UnitTests=${UnitTests}
+TEST_TIMEOUT=${TEST_TIMEOUT}
+
+if [[ ${name}"x" == "x" ]]; then
+    echo "can't find name, please set TEST_TARGET_NAME first"
+    exit 1
+fi
+
+if [[ ${UnitTests}"x" == "x" ]]; then
+    echo "can't find UnitTests, please set TEST_TARGET_NAME first"
+    exit 1
+fi
+
+if [[ ${TEST_TIMEOUT}"x" == "x" ]]; then
+    echo "can't find ${TEST_TIMEOUT}, please set ${TEST_TIMEOUT} first"
+    exit 1
+fi
+
+if [[ ${WITH_COVERAGE} == "ON" ]]; then
+    PYTHON_EXEC="python -u -m coverage run --branch -p "
+else
+    PYTHON_EXEC="python -u "
+fi
+
+run_time=$(( $TEST_TIMEOUT - 10 ))
+echo "run_time: ${run_time}"
+for ut in ${UnitTests}; do
+    echo "start ${ut}"
+    timeout -s SIGKILL ${run_time} ${PYTHON_EXEC} ./${ut}.py > ${ut}_run.log 2>&1 &
+done
+
+FAIL=0
+for job in `jobs -p`
+do
+    echo "jobs -p result:" `jobs -p`
+    echo $job
+    wait $job || let FAIL=FAIL+1
+done
+
+echo "fail_num:" $FAIL
+
+if [ "$FAIL" == "0" ];
+then
+    exit 0
+else
+    echo "FAIL! ($FAIL)"
+
+    for ut in ${UnitTests}; do
+        log=${ut}_run.log
+        echo "cat ${log}"
+        cat $log
+    done
+
+    exit 1
+fi
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index 17e0cd0d5b18652f828af9936b07cb4122f87b97..45d39afc115d292fd79a3bbc4f609ad080f74602 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -36,7 +36,7 @@ remove_dropout = False
 # and Executor is different.
 remove_bn = False
 
-remove_cudnn_conv = False
+remove_cudnn_conv = True
 
 remove_dropout = True
 remove_bn = True
@@ -179,7 +179,7 @@ def batch_size(use_cuda):
 def iter(use_cuda):
     if use_cuda:
         return 10
-    return 2
+    return 1
 
 
 gpu_img, gpu_label = init_data(
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 124767a3364b078ea2c74795c03497f3dc24ba8c..ab61a5b3cfccb0e885debe9786ae91a9754e9345 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -22,7 +22,7 @@ from scipy.special import expit, erf
 import paddle
 import paddle.fluid as fluid
 import paddle.nn as nn
-import paddle.nn.functional as functional
+import paddle.nn.functional as F
 from paddle.fluid import compiler, Program, program_guard
 
 
@@ -118,7 +118,7 @@ class TestLogSigmoid(TestActivation):
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         out = np.log(1 / (1 + np.exp(-x)))
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -127,6 +127,48 @@ class TestLogSigmoid(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
+class TestLogSigmoidAPI(unittest.TestCase):
+    # test paddle.nn.LogSigmoid, paddle.nn.functional.logsigmoid
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [11, 17])
+            out1 = F.logsigmoid(x)
+            m = paddle.nn.LogSigmoid()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.logsigmoid(x)
+        m = paddle.nn.LogSigmoid()
+        out2 = m(x)
+        out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.logsigmoid, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32')
+            self.assertRaises(TypeError, F.logsigmoid, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16')
+            F.logsigmoid(x_fp16)
+
+
 class TestTanh(TestActivation, TestParameter):
     def setUp(self):
         self.op_type = "tanh"
@@ -149,6 +191,59 @@ class TestTanh(TestActivation, TestParameter):
         self.dtype = np.float32
 
 
+class TestTanhAPI(unittest.TestCase):
+    # test paddle.tanh, paddle.nn.tanh, paddle.nn.functional.tanh
+    def setUp(self):
+        self.dtype = 'float32'
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12], self.dtype)
+            out1 = F.tanh(x)
+            th = paddle.nn.Tanh()
+            out2 = th(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = np.tanh(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.tanh(x)
+        out2 = paddle.tanh(x)
+        th = paddle.nn.Tanh()
+        out3 = th(x)
+        out_ref = np.tanh(self.x_np)
+        for r in [out1, out2, out3]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', [10, 12], self.dtype)
+            out = fluid.layers.tanh(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = np.tanh(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.tanh, 1)
+            # The input dtype must be float16, float32.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.tanh, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.tanh(x_fp16)
+
+
 class TestAtan(TestActivation, TestParameter):
     def setUp(self):
         self.op_type = "atan"
@@ -327,15 +422,20 @@ class TestCoshOpError(unittest.TestCase):
             fluid.layers.cosh(x_fp16)
 
 
-class TestTanhShrink(TestActivation):
+def ref_tanhshrink(x):
+    out = x - np.tanh(x)
+    return out
+
+
+class TestTanhshrink(TestActivation):
     def setUp(self):
         self.op_type = "tanh_shrink"
         self.init_dtype()
 
-        x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
-        out = x - np.tanh(x)
+        x = np.random.uniform(10, 20, [10, 17]).astype(self.dtype)
+        out = ref_tanhshrink(x)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -344,52 +444,224 @@ class TestTanhShrink(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestTanhshrinkAPI(unittest.TestCase):
+    # test paddle.nn.Tanhshrink, paddle.nn.functional.tanhshrink
+    def setUp(self):
+        self.x_np = np.random.uniform(10, 20, [10, 17]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.tanhshrink(x)
+            tanhshrink = paddle.nn.Tanhshrink()
+            out2 = tanhshrink(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_tanhshrink(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.tanhshrink(x)
+        tanhshrink = paddle.nn.Tanhshrink()
+        out2 = tanhshrink(x)
+        out_ref = ref_tanhshrink(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.tanh_shrink(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_tanhshrink(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.tanhshrink, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.tanhshrink, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.tanhshrink(x_fp16)
+
+
+def ref_hardshrink(x, threshold):
+    out = np.copy(x)
+    out[(out >= -threshold) & (out <= threshold)] = 0
+    return out
+
+
 class TestHardShrink(TestActivation):
     def setUp(self):
         self.op_type = "hard_shrink"
         self.init_dtype()
 
-        threshold = 0.5
+        self.threshold = 0.5
+        self.set_attrs()
         x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10
-        out = np.copy(x)
-        out[(out >= -threshold) & (out <= threshold)] = 0
+        out = ref_hardshrink(x, self.threshold)
 
-        self.attrs = {'lambda': threshold}
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'threshold': self.threshold}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
+    def set_attrs(self):
+        pass
+
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
         self.check_grad(['X'], 'Out')
 
 
-class TestHardShrinkOpError(unittest.TestCase):
+class TestHardShrink_threshold_negative(TestHardShrink):
+    def set_attrs(self):
+        self.threshold = -0.1
+
+
+class TestHardShrinkAPI(unittest.TestCase):
+    # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.hardshrink(x)
+            hd = paddle.nn.Hardshrink()
+            out2 = hd(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_hardshrink(self.x_np, 0.5)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.hardshrink(x)
+        hd = paddle.nn.Hardshrink()
+        out2 = hd(x)
+        out_ref = ref_hardshrink(self.x_np, 0.5)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.hardshrink(x, 0.6)
+        hd = paddle.nn.Hardshrink(0.6)
+        out2 = hd(x)
+        out_ref = ref_hardshrink(self.x_np, 0.6)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', [10, 12])
+            out = fluid.layers.hard_shrink(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_hardshrink(self.x_np, 0.5)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.hard_shrink, 1)
+            self.assertRaises(TypeError, F.hardshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.hard_shrink, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.hardshrink, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.hard_shrink(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.hardshrink(x_fp16)
+
+
+def ref_hardtanh(x, min=-1.0, max=1.0):
+    out = np.copy(x)
+    out[np.abs(x - min) < 0.005] = min + 0.02
+    out[np.abs(x - max) < 0.005] = max + 0.02
+    out = np.minimum(np.maximum(x, min), max)
+    return out
+
+
+class TestHardtanhAPI(unittest.TestCase):
+    # test paddle.nn.Hardtanh, paddle.nn.functional.hardtanh
+    def setUp(self):
+        self.x_np = np.random.uniform(-3, 3, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.hardtanh(x)
+            m = paddle.nn.Hardtanh()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_hardtanh(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.hardtanh(x)
+        m = paddle.nn.Hardtanh()
+        out2 = m(x)
+        out_ref = ref_hardtanh(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.hardtanh(x, -2.0, 2.0)
+        m = paddle.nn.Hardtanh(-2.0, 2.0)
+        out2 = m(x)
+        out_ref = ref_hardtanh(self.x_np, -2.0, 2.0)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.hardtanh, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.hardtanh, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.hardtanh(x_fp16)
 
 
-class TestSoftShrink(TestActivation):
+def ref_softshrink(x, threshold=0.5):
+    out = np.copy(x)
+    out = (out < -threshold) * (out + threshold) + (out > threshold) * (
+        out - threshold)
+    return out
+
+
+class TestSoftshrink(TestActivation):
     def setUp(self):
         self.op_type = "softshrink"
         self.init_dtype()
 
-        lambda_val = 0.1
-        x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
-        out = np.copy(x)
-        out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
-            out - lambda_val)
+        threshold = 0.8
 
-        self.attrs = {'lambda': lambda_val}
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
+        out = ref_softshrink(x, threshold)
+        self.inputs = {'X': x}
+        self.attrs = {"lambda": threshold}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -398,17 +670,59 @@ class TestSoftShrink(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestSoftShrinkOpError(unittest.TestCase):
+class TestSoftshrinkAPI(unittest.TestCase):
+    # test paddle.nn.Softshrink, paddle.nn.functional.softshrink
+    def setUp(self):
+        self.threshold = 0.8
+        self.x_np = np.random.uniform(0.25, 10, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.softshrink(x, self.threshold)
+            softshrink = paddle.nn.Softshrink(self.threshold)
+            out2 = softshrink(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softshrink(self.x_np, self.threshold)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softshrink(x, self.threshold)
+        softshrink = paddle.nn.Softshrink(self.threshold)
+        out2 = softshrink(x)
+        out_ref = ref_softshrink(self.x_np, self.threshold)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.softshrink(x, self.threshold)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_softshrink(self.x_np, self.threshold)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.softshrink, 1)
+            self.assertRaises(TypeError, F.softshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.softshrink, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.softshrink, x_int32)
+            # The threshold must be no less than zero
+            x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32')
+            self.assertRaises(ValueError, F.softshrink, x_fp32, -1.0)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.softshrink(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.softshrink(x_fp16)
 
 
 class TestSqrt(TestActivation, TestParameter):
@@ -594,7 +908,7 @@ class TestRelu(TestActivation):
         x[np.abs(x) < 0.005] = 0.02
         out = np.maximum(x, 0)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -603,32 +917,72 @@ class TestRelu(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestReluOpError(unittest.TestCase):
+class TestReluAPI(unittest.TestCase):
+    # test paddle.nn.ReLU, paddle.nn.functional.relu
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.relu(x)
+            m = paddle.nn.ReLU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = np.maximum(self.x_np, 0)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.relu(x)
+        m = paddle.nn.ReLU()
+        out2 = m(x)
+        out_ref = np.maximum(self.x_np, 0)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.relu, 1)
+            self.assertRaises(TypeError, F.relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.relu, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[10, 12], dtype='int32')
+            self.assertRaises(TypeError, F.relu, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.relu(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[10, 12], dtype='float16')
+            F.relu(x_fp16)
+
+
+def ref_leaky_relu(x, alpha=0.01):
+    out = np.copy(x)
+    out[out < 0] *= alpha
+    return out
 
 
 class TestLeakyRelu(TestActivation):
+    def get_alpha(self):
+        return 0.02
+
     def setUp(self):
         self.op_type = "leaky_relu"
         self.init_dtype()
+        alpha = self.get_alpha()
 
+        np.random.seed(10)
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0.02 * x)
+        x[np.abs(x) < 0.005] = 0.05
+        out = ref_leaky_relu(x, alpha)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
+        self.attrs = {'alpha': alpha}
 
     def test_check_grad(self):
         if self.dtype == np.float16:
@@ -636,18 +990,78 @@ class TestLeakyRelu(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestLeakyReluOpError(unittest.TestCase):
+class TestLeakyReluAlpha1(TestLeakyRelu):
+    def get_alpha(self):
+        return 2
+
+
+class TestLeakyReluAlpha2(TestLeakyRelu):
+    def get_alpha(self):
+        return -0.01
+
+
+class TestLeakyReluAlpha3(TestLeakyRelu):
+    def get_alpha(self):
+        return -2.0
+
+
+class TestLeakyReluAPI(unittest.TestCase):
+    # test paddle.nn.LeakyReLU, paddle.nn.functional.leaky_relu,
+    # fluid.layers.leaky_relu
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.leaky_relu(x)
+            m = paddle.nn.LeakyReLU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_leaky_relu(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x_np)
+        out1 = F.leaky_relu(x)
+        m = paddle.nn.LeakyReLU()
+        out2 = m(x)
+        out_ref = ref_leaky_relu(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.leaky_relu(x, 0.6)
+        m = paddle.nn.LeakyReLU(0.6)
+        out2 = m(x)
+        out_ref = ref_leaky_relu(self.x_np, 0.6)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', [10, 12])
+            out = fluid.layers.leaky_relu(x, 0.01)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_leaky_relu(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.leaky_relu, 1)
+            self.assertRaises(TypeError, F.leaky_relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.leaky_relu, x_int32)
-            # support the input dtype is float32
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float32')
-            fluid.layers.leaky_relu(x_fp16)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.leaky_relu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.leaky_relu(x_fp16)
 
 
 def gelu(x, approximate):
@@ -667,7 +1081,7 @@ class TestGeluApproximate(TestActivation):
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         out = gelu(x, approximate)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
         self.attrs = {"approximate": approximate}
 
@@ -685,7 +1099,7 @@ class TestGelu(TestActivation):
         x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         out = gelu(x, approximate)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
         self.attrs = {"approximate": approximate}
 
@@ -695,6 +1109,55 @@ class TestGelu(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestGELUAPI(unittest.TestCase):
+    # test paddle.nn.GELU, paddle.nn.functional.gelu
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [11, 17])
+            out1 = F.gelu(x)
+            m = paddle.nn.GELU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = gelu(self.x_np, False)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.gelu(x)
+        m = paddle.nn.GELU()
+        out2 = m(x)
+        out_ref = gelu(self.x_np, False)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.gelu(x, True)
+        m = paddle.nn.GELU(True)
+        out2 = m(x)
+        out_ref = gelu(self.x_np, True)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.gelu, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32')
+            self.assertRaises(TypeError, F.gelu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16')
+            F.gelu(x_fp16)
+
+
 class TestBRelu(TestActivation):
     def setUp(self):
         self.op_type = "brelu"
@@ -734,20 +1197,24 @@ class TestBReluOpError(unittest.TestCase):
             fluid.layers.brelu(x_fp16)
 
 
+def ref_relu6(x, threshold=6.0):
+    out = np.copy(x)
+    out[np.abs(x - threshold) < 0.005] = threshold + 0.02
+    out = np.minimum(np.maximum(x, 0), threshold)
+    return out
+
+
 class TestRelu6(TestActivation):
     def setUp(self):
         self.op_type = "relu6"
         self.init_dtype()
 
         x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype)
-        threshold = 6.0
-        # The same with TestAbs
         x[np.abs(x) < 0.005] = 0.02
-        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
-        out = np.minimum(np.maximum(x, 0), threshold)
+        out = ref_relu6(x)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'threshold': threshold}
+        self.inputs = {'X': x}
+        self.attrs = {'threshold': 6.0}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -756,17 +1223,56 @@ class TestRelu6(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestRelu6OpError(unittest.TestCase):
+class TestRelu6API(unittest.TestCase):
+    # test paddle.nn.ReLU6, paddle.nn.functional.relu6
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 10, [10, 12]).astype(np.float64)
+        self.x_np[np.abs(self.x_np) < 0.005] = 0.02
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.relu6(x)
+            relu6 = paddle.nn.ReLU6()
+            out2 = relu6(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_relu6(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.relu6(x)
+        relu6 = paddle.nn.ReLU6()
+        out2 = relu6(x)
+        out_ref = ref_relu6(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.relu6(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_relu6(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.relu6, 1)
+            self.assertRaises(TypeError, F.relu6, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.relu6, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.relu6, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.relu6(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.relu6(x_fp16)
 
 
 class TestHardSwish(TestActivation):
@@ -844,6 +1350,11 @@ class TestSoftReluOpError(unittest.TestCase):
             fluid.layers.soft_relu(x_fp16)
 
 
+def elu(x, alpha):
+    out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+    return out_ref.astype(x.dtype)
+
+
 class TestELU(TestActivation):
     def setUp(self):
         self.op_type = "elu"
@@ -851,7 +1362,7 @@ class TestELU(TestActivation):
 
         x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype)
         alpha = 1.
-        out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+        out = elu(x, alpha)
         # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
         # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
         self.inputs = {'X': x}
@@ -864,16 +1375,53 @@ class TestELU(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
-class TestELUOpError(unittest.TestCase):
+class TestELUAPI(unittest.TestCase):
+    # test paddle.nn.ELU, paddle.nn.functional.elu
+    def setUp(self):
+        self.x_np = np.random.uniform(-3, 3, [10, 12]).astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12])
+            out1 = F.elu(x)
+            m = paddle.nn.ELU()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = elu(self.x_np, 1.0)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.elu(x)
+        m = paddle.nn.ELU()
+        out2 = m(x)
+        out_ref = elu(self.x_np, 1.0)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.elu(x, 0.2)
+        m = paddle.nn.ELU(0.2)
+        out2 = m(x)
+        out_ref = elu(self.x_np, 0.2)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
     def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of elu_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.elu, x1)
-            # The input dtype of elu_op must be float16 float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.elu, x2)
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.elu, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[10, 12], dtype='int32')
+            self.assertRaises(TypeError, F.elu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[10, 12], dtype='float16')
+            F.elu(x_fp16)
 
 
 class TestReciprocal(TestActivation):
@@ -1107,16 +1655,25 @@ class TestSTanhOpError(unittest.TestCase):
             fluid.layers.stanh(x_fp16)
 
 
+def ref_softplus(x, beta=1, threshold=20):
+    x_beta = beta * x
+    out = np.select([x_beta <= threshold, x_beta > threshold],
+                    [np.log(1 + np.exp(x_beta)) / beta, x])
+    return out
+
+
 class TestSoftplus(TestActivation):
     def setUp(self):
         self.op_type = "softplus"
         self.init_dtype()
-        self.dtype = np.float64
 
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.log(1 + np.exp(x))
+        beta = 2
+        threshold = 15
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        out = ref_softplus(x, beta, threshold)
+        self.inputs = {'X': x}
+        self.attrs = {'beta': beta, "threshold": threshold}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -1125,15 +1682,72 @@ class TestSoftplus(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestSoftplusAPI(unittest.TestCase):
+    # test paddle.nn.Softplus, paddle.nn.functional.softplus
+    def setUp(self):
+        self.beta = 2
+        self.threshold = 15
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.softplus(x, self.beta, self.threshold)
+            softplus = paddle.nn.Softplus(self.beta, self.threshold)
+            out2 = softplus(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softplus(self.x_np, self.beta, self.threshold)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softplus(x, self.beta, self.threshold)
+        softplus = paddle.nn.Softplus(self.beta, self.threshold)
+        out2 = softplus(x)
+        out_ref = ref_softplus(self.x_np, self.beta, self.threshold)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.softplus(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_softplus(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.softplus, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.softplus, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.softplus(x_fp16)
+
+
+def ref_softsign(x):
+    out = np.divide(x, 1 + np.abs(x))
+    return out
+
+
 class TestSoftsign(TestActivation):
     def setUp(self):
         self.op_type = "softsign"
         self.init_dtype()
 
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        out = np.divide(x, 1 + np.abs(x))
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        out = ref_softsign(x)
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -1142,6 +1756,57 @@ class TestSoftsign(TestActivation):
         self.check_grad(['X'], 'Out')
 
 
+class TestSoftsignAPI(unittest.TestCase):
+    # test paddle.nn.Softsign, paddle.nn.functional.softsign
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.softsign(x)
+            softsign = paddle.nn.Softsign()
+            out2 = softsign(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softsign(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softsign(x)
+        softsign = paddle.nn.Softsign()
+        out2 = softsign(x)
+        out_ref = ref_softsign(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.softsign(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_softsign(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.softsign, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.softsign, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.softsign(x_fp16)
+
+
 class TestThresholdedRelu(TestActivation):
     def setUp(self):
         self.op_type = "thresholded_relu"
@@ -1337,9 +2002,9 @@ create_test_act_fp16_class(TestActivation)
 create_test_act_fp16_class(TestSigmoid)
 create_test_act_fp16_class(TestLogSigmoid)
 create_test_act_fp16_class(TestTanh)
-create_test_act_fp16_class(TestTanhShrink)
+create_test_act_fp16_class(TestTanhshrink)
 create_test_act_fp16_class(TestHardShrink)
-create_test_act_fp16_class(TestSoftShrink)
+create_test_act_fp16_class(TestSoftshrink)
 create_test_act_fp16_class(TestSqrt)
 create_test_act_fp16_class(TestAbs)
 create_test_act_fp16_class(TestCeil, grad_check=False)
@@ -1372,140 +2037,5 @@ create_test_act_fp16_class(TestHardSigmoid)
 create_test_act_fp16_class(TestSwish)
 create_test_act_fp16_class(TestHardSwish)
 
-
-class TestNNReluAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 12]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return np.maximum(x, 0)
-
-    def ref_backward(self, y, dy):
-        y_t = y.copy()
-        y_t[y_t > 0] = 1
-        return y_t * dy
-
-    def check_api(self, place=fluid.CPUPlace(), inplace=False):
-        main_program = Program()
-        myrelu = nn.ReLU(inplace)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            x.stop_gradient = False
-            y = myrelu(x)
-            fluid.backward.append_backward(fluid.layers.mean(y))
-        exe = fluid.Executor(place)
-        out = exe.run(main_program,
-                      feed={'x': self.x},
-                      fetch_list=[y, y.grad_name, x.grad_name])
-        self.assertTrue(np.allclose(out[0], self.y))
-        self.assertTrue(np.allclose(out[2], self.ref_backward(self.y, out[1])))
-
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = myrelu(x)
-        self.assertTrue(np.allclose(y.numpy(), self.y))
-
-    def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            for inplace in [True, False]:
-                self.check_api(place, inplace)
-
-
-class TestNNFunctionalReluAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 12]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return np.maximum(x, 0)
-
-    def test_check_api(self):
-        main_program = Program()
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = functional.relu(x)
-        exe = fluid.Executor(fluid.CPUPlace())
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
-        self.assertTrue(np.allclose(out[0], self.y))
-
-
-class TestNNSigmoidAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 15]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return 1 / (1 + np.exp(-x))
-
-    def ref_backward(self, y, dy):
-        return dy * y * (1 - y)
-
-    def check_api(self, place=fluid.CPUPlace(), inplace=False):
-        main_program = Program()
-        mysigmoid = nn.Sigmoid(inplace)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            x.stop_gradient = False
-            y = mysigmoid(x)
-            fluid.backward.append_backward(fluid.layers.mean(y))
-        exe = fluid.Executor(place)
-        out = exe.run(main_program,
-                      feed={'x': self.x},
-                      fetch_list=[y, y.grad_name, x.grad_name])
-        self.assertTrue(np.allclose(out[0], self.y))
-        self.assertTrue(np.allclose(out[2], self.ref_backward(self.y, out[1])))
-
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = mysigmoid(x)
-        self.assertTrue(np.allclose(y.numpy(), self.y))
-
-    def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            for inplace in [True, False]:
-                self.check_api(place, inplace)
-
-
-class TestNNFunctionalSigmoidAPI(unittest.TestCase):
-    def setUp(self):
-        self.init_data()
-
-    def init_data(self):
-        self.x_shape = [10, 15]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.y = self.ref_forward(self.x)
-
-    def ref_forward(self, x):
-        return 1 / (1 + np.exp(-x))
-
-    def test_check_api(self):
-        main_program = Program()
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = functional.sigmoid(x)
-        exe = fluid.Executor(fluid.CPUPlace())
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
-        self.assertTrue(np.allclose(out[0], self.y))
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 7a7099b7113c8233fb94074519386f9e4270a019..990499858ca52f5b471211aa659e64d3e13fccc3 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -20,6 +20,7 @@ from op_test import OpTest
 from paddle.fluid import core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
+import paddle
 
 
 class TestAdamOp1(OpTest):
@@ -401,46 +402,111 @@ class TestAdamOpBetaVariable(OpTest):
         self.check_output()
 
 
-class TestAdamOptimizerBetaVariable(unittest.TestCase):
-    def test_adam_optimizer(self):
-        def test_with_place(place, shape):
-            exe = fluid.Executor(place)
-
-            train_prog = fluid.Program()
-            startup = fluid.Program()
-            with fluid.program_guard(train_prog, startup):
-                with fluid.unique_name.guard():
-                    data = fluid.data(name="data", shape=shape)
-                    conv = fluid.layers.conv2d(data, 8, 3)
-                    loss = fluid.layers.reduce_mean(conv)
-
-                    beta1 = fluid.layers.create_global_var(
-                        shape=[1],
-                        value=0.85,
-                        dtype='float32',
-                        persistable=True)
-                    beta2 = fluid.layers.create_global_var(
-                        shape=[1],
-                        value=0.95,
-                        dtype='float32',
-                        persistable=True)
-                    opt = fluid.optimizer.Adam(
-                        learning_rate=1e-5, beta1=beta1, beta2=beta2)
-                    opt.minimize(loss)
-
-            exe.run(startup)
-            data_np = np.random.random(shape).astype('float32')
-            rets = exe.run(train_prog,
-                           feed={"data": data_np},
-                           fetch_list=[loss])
-            assert rets[0] is not None
-
+class TestAdamOpV2(unittest.TestCase):
+    def test_adam_op(self):
+        place = fluid.CPUPlace()
         shape = [2, 3, 8, 8]
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            test_with_place(place, shape)
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = fluid.layers.reduce_mean(conv)
+
+                beta1 = fluid.layers.create_global_var(
+                    shape=[1], value=0.85, dtype='float32', persistable=True)
+                beta2 = fluid.layers.create_global_var(
+                    shape=[1], value=0.95, dtype='float32', persistable=True)
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.Adam(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+    def test_adam_op_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = fluid.dygraph.to_variable(value)
+        linear = fluid.Linear(13, 5, dtype="float32")
+
+        adam = paddle.optimizer.Adam(
+            learning_rate=0.01, parameters=linear.parameters())
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adam_op_with_state_dict(self):
+
+        import paddle
+        paddle.disable_static()
+        emb = paddle.nn.Embedding([10, 10])
+
+        adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+        state_dict = adam.state_dict()
+        adam.set_state_dict(state_dict)
+
+        #learning_rate is Decay
+        learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120)
+        adam = paddle.optimizer.Adam(
+            learning_rate=learning_rate,
+            weight_decay=fluid.regularizer.L2Decay(0.001),
+            parameters=emb.parameters())
+        lr = adam.get_lr()
+        state_dict = adam.state_dict()
+        adam.set_state_dict(state_dict)
+
+        #leanrning_rate is Tensor
+        with self.assertRaises(TypeError):
+            learning_rate = np.array([0.01]).astype("float32")
+            learning_rate = paddle.to_tensor(learning_rate)
+            adam = paddle.optimizer.Adam(
+                learning_rate=learning_rate, parameters=emb.parameters())
+
+        params = adam.get_opti_var_name_list()
+        assert (params is not None)
+
+    def test_adam_with_grad_clip(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = fluid.dygraph.to_variable(value)
+        linear = fluid.Linear(13, 5, dtype="float32")
+        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        adam = paddle.optimizer.Adam(
+            0.1, parameters=linear.parameters(), grad_clip=clip)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adam_op_with_set_lr(self):
+        paddle.disable_static()
+        linear = paddle.nn.Linear(10, 10)
+        adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
+
+        lr = 0.01
+        adam.set_lr(lr)
+        cur_lr = adam.get_lr()
+        assert (lr == cur_lr)
+
+        lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32')
+        adam.set_lr(lr_var)
+        cur_lr = adam.get_lr()
+        assert (np.float32(lr) == cur_lr)
+
+        with self.assertRaises(TypeError):
+            lr = int(1)
+            adam.set_lr(lr)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6946dc80b5e55b2e7149f357fe0600916a4fe9f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+
+class TestAdamaxAPI(unittest.TestCase):
+    def test_adamax_api_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_variable(value)
+        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        adam = paddle.optimizer.Adamax(
+            learning_rate=0.01,
+            parameters=linear.parameters(),
+            weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adamax_api(self):
+        place = fluid.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+                beta1 = 0.85
+                beta2 = 0.95
+                opt = paddle.optimizer.Adamax(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddb70d6e6400c8e7ae71cabf92ce8060e220a7da
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import numpy as np
+import paddle.fluid as fluid
+
+
+class TestAdamWOp(unittest.TestCase):
+    def test_adamw_op_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_variable(value)
+        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        adam = paddle.optimizer.AdamW(
+            learning_rate=0.01,
+            parameters=linear.parameters(),
+            apply_decay_param_fun=lambda name: True,
+            weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_adamw_op_coverage(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_variable(value)
+        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        adam = paddle.optimizer.AdamW(
+            learning_rate=0.0,
+            parameters=linear.parameters(),
+            apply_decay_param_fun=lambda name: True,
+            weight_decay=0.01)
+        assert (adam.__str__() is not None)
+
+    def test_adamw_op(self):
+        place = fluid.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+
+                beta1 = fluid.layers.create_global_var(
+                    shape=[1], value=0.85, dtype='float32', persistable=True)
+                beta2 = fluid.layers.create_global_var(
+                    shape=[1], value=0.95, dtype='float32', persistable=True)
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.AdamW(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c30e3d2ade0725e6debcdd0a69ca4eee622aec
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -0,0 +1,274 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
+                            pool_type="avg"):
+
+    N = x.shape[0]
+    C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \
+        else [x.shape[3], x.shape[1], x.shape[2]]
+
+    if (isinstance(output_size, int) or output_size == None):
+        H_out = output_size
+        W_out = output_size
+        output_size = [H_out, W_out]
+    else:
+        H_out, W_out = output_size
+
+    if output_size[0] == None:
+        output_size[0] = H
+        H_out = H
+    if output_size[1] == None:
+        output_size[1] = W
+        W_out = W
+
+    out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \
+        else np.zeros((N, H_out, W_out, C))
+
+    for i in range(H_out):
+        in_h_start = adaptive_start_index(i, H, output_size[0])
+        in_h_end = adaptive_end_index(i, H, output_size[0])
+
+        for j in range(W_out):
+            in_w_start = adaptive_start_index(j, W, output_size[1])
+            in_w_end = adaptive_end_index(j, W, output_size[1])
+
+            if data_format == 'NCHW':
+                x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
+                if pool_type == 'avg':
+                    field_size = (
+                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
+                elif pool_type == 'max':
+                    out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+            elif data_format == 'NHWC':
+                x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
+                if pool_type == 'avg':
+                    field_size = (
+                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
+                elif pool_type == 'max':
+                    out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
+    return out
+
+
+class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[2, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool2d_forward(
+            x=self.x_np,
+            output_size=[3, 3],
+            pool_type="avg",
+            data_format="NHWC")
+
+        self.res_5_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[None, 3], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[2, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3], data_format="NHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[None, 3])
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[2, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[3, 3], data_format="NHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool2d(
+                x=x, output_size=[None, 3])
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[2, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool2d_forward(
+            x=self.x_np,
+            output_size=[3, 3],
+            pool_type="avg",
+            data_format="NHWC")
+
+        self.res_5_np = adaptive_pool2d_forward(
+            x=self.x_np, output_size=[None, 3], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[3, 3], data_format="NHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[None, 3])
+            out_5 = adaptive_avg_pool(x=x)
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[3, 3], data_format="NHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+                output_size=[None, 3])
+            out_5 = adaptive_avg_pool(x=x)
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..c04ee660667edaff01d7029e83b912c05429a15f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -0,0 +1,293 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def adaptive_pool3d_forward(x,
+                            output_size,
+                            adaptive=True,
+                            data_format='NCDHW',
+                            pool_type='avg'):
+
+    N = x.shape[0]
+    C, D, H, W = [x.shape[1], x.shape[2], x.shape[3], x.shape[4]] \
+        if data_format == 'NCDHW' else [x.shape[4], x.shape[1], x.shape[2],x.shape[3]]
+
+    if (isinstance(output_size, int) or output_size == None):
+        H_out = output_size
+        W_out = output_size
+        D_out = output_size
+        output_size = [D_out, H_out, W_out]
+    else:
+        D_out, H_out, W_out = output_size
+
+    if output_size[0] == None:
+        output_size[0] = D
+        D_out = D
+    if output_size[1] == None:
+        output_size[1] = H
+        H_out = H
+    if output_size[2] == None:
+        output_size[2] = W
+        W_out = W
+
+    out = np.zeros((N, C, D_out, H_out, W_out)) if data_format=='NCDHW' \
+        else np.zeros((N, D_out, H_out, W_out, C))
+    for k in range(D_out):
+        d_start = adaptive_start_index(k, D, output_size[0])
+        d_end = adaptive_end_index(k, D, output_size[0])
+
+        for i in range(H_out):
+            h_start = adaptive_start_index(i, H, output_size[1])
+            h_end = adaptive_end_index(i, H, output_size[1])
+
+            for j in range(W_out):
+                w_start = adaptive_start_index(j, W, output_size[2])
+                w_end = adaptive_end_index(j, W, output_size[2])
+
+                if data_format == 'NCDHW':
+                    x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
+                                 w_end]
+                    if pool_type == 'avg':
+                        field_size = (d_end - d_start) * (h_end - h_start) * (
+                            w_end - w_start)
+                        out[:, :, k, i, j] = np.sum(x_masked,
+                                                    axis=(2, 3, 4)) / field_size
+                    elif pool_type == 'max':
+                        out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+
+                elif data_format == 'NDHWC':
+                    x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
+                                 w_end, :]
+                    if pool_type == 'avg':
+                        field_size = (d_end - d_start) * (h_end - h_start) * (
+                            w_end - w_start)
+                        out[:, k, i, j, :] = np.sum(x_masked,
+                                                    axis=(1, 2, 3)) / field_size
+                    elif pool_type == 'max':
+                        out[:, k, i, j, :] = np.max(x_masked, axis=(1, 2, 3))
+    return out
+
+
+class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[3, 3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[2, 3, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool3d_forward(
+            x=self.x_np,
+            output_size=[3, 3, 3],
+            pool_type="avg",
+            data_format="NDHWC")
+
+        self.res_5_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[None, 3, None], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool3d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[2, 3, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3], data_format="NDHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[None, 3, None])
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            out_1 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3])
+
+            out_2 = paddle.nn.functional.adaptive_avg_pool3d(x=x, output_size=5)
+
+            out_3 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[2, 3, 5])
+
+            out_4 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[3, 3, 3], data_format="NDHWC")
+
+            out_5 = paddle.nn.functional.adaptive_avg_pool3d(
+                x=x, output_size=[None, 3, None])
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
+        self.res_1_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[3, 3, 3], pool_type="avg")
+
+        self.res_2_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=5, pool_type="avg")
+
+        self.res_3_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[2, 3, 5], pool_type="avg")
+
+        self.res_4_np = adaptive_pool3d_forward(
+            x=self.x_np,
+            output_size=[3, 3, 3],
+            pool_type="avg",
+            data_format="NDHWC")
+
+        self.res_5_np = adaptive_pool3d_forward(
+            x=self.x_np, output_size=[None, 3, None], pool_type="avg")
+
+    def test_static_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.enable_static()
+            x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[2, 3, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3], data_format="NDHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[None, 3, None])
+            out_5 = adaptive_avg_pool(x=x)
+
+            exe = paddle.static.Executor(place=place)
+            [res_1, res_2, res_3, res_4, res_5] = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_np},
+                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+            assert np.allclose(res_1, self.res_1_np)
+
+            assert np.allclose(res_2, self.res_2_np)
+
+            assert np.allclose(res_3, self.res_3_np)
+
+            assert np.allclose(res_4, self.res_4_np)
+
+            assert np.allclose(res_5, self.res_5_np)
+
+    def test_dynamic_graph(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+            paddle.disable_static(place=place)
+            x = paddle.to_variable(self.x_np)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3])
+            out_1 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5)
+            out_2 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[2, 3, 5])
+            out_3 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[3, 3, 3], data_format="NDHWC")
+            out_4 = adaptive_avg_pool(x=x)
+
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+                output_size=[None, 3, None])
+            out_5 = adaptive_avg_pool(x=x)
+
+            assert np.allclose(out_1.numpy(), self.res_1_np)
+
+            assert np.allclose(out_2.numpy(), self.res_2_np)
+
+            assert np.allclose(out_3.numpy(), self.res_3_np)
+
+            assert np.allclose(out_4.numpy(), self.res_4_np)
+
+            assert np.allclose(out_5.numpy(), self.res_5_np)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py
index 8c0b599a37936ffe47ac44ae54fe6e25768e4a4f..6e66c0c0029accdcdf81ae67dff1a49e3e8867d4 100644
--- a/python/paddle/fluid/tests/unittests/test_addmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py
@@ -63,18 +63,104 @@ class TestAddMMOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of addmm_op must be Variable.
+
             input = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+                np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
             x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+                np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
             x2 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+                np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.addmm, input, x1, x2)
+
             # The input dtype of mul_op must be float32 or float64.
-            input = fluid.layers.data(name='input', shape=[4], dtype="int32")
-            x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
-            x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
+            input = fluid.layers.data(
+                name='input',
+                shape=[4, 4],
+                dtype="int32",
+                append_batch_size=False)
+            x3 = fluid.layers.data(
+                name='x3', shape=[4, 4], dtype="int32", append_batch_size=False)
+            x4 = fluid.layers.data(
+                name='x4', shape=[4, 4], dtype="int32", append_batch_size=False)
             self.assertRaises(TypeError, paddle.addmm, input, x3, x4)
+            # x and y dimension mismatch
+            x5 = fluid.layers.data(
+                name='x5',
+                shape=[4, 5],
+                dtype="float32",
+                append_batch_size=False)
+            x6 = fluid.layers.data(
+                name='x6',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            self.assertRaises(ValueError, paddle.addmm, input, x5, x6)
+            # input and x are not broadcastable
+            x7 = fluid.layers.data(
+                name='x7',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            x8 = fluid.layers.data(
+                name='x8',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            input1 = fluid.layers.data(
+                name='input1',
+                shape=[2, 4],
+                dtype="float32",
+                append_batch_size=False)
+            self.assertRaises(ValueError, paddle.addmm, input1, x7, x8)
+            # input and x are not broadcastable
+            x9 = fluid.layers.data(
+                name='x9',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            x10 = fluid.layers.data(
+                name='x10',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            input2 = fluid.layers.data(
+                name='input2',
+                shape=[1, 2],
+                dtype="float32",
+                append_batch_size=False)
+            self.assertRaises(ValueError, paddle.addmm, input2, x9, x10)
+            x11 = fluid.layers.data(
+                name='x11',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            x12 = fluid.layers.data(
+                name='x12',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            input3 = fluid.layers.data(
+                name='input3',
+                shape=[4, 2],
+                dtype="float32",
+                append_batch_size=False)
+            self.assertRaises(ValueError, paddle.addmm, input3, x11, x12)
+            x13 = fluid.layers.data(
+                name='x13',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            x14 = fluid.layers.data(
+                name='x14',
+                shape=[4, 4],
+                dtype="float32",
+                append_batch_size=False)
+            input4 = fluid.layers.data(
+                name='input4',
+                shape=[3, 1],
+                dtype="float32",
+                append_batch_size=False)
+            self.assertRaises(ValueError, paddle.addmm, input4, x13, x14)
 
 
 class TestAddMMOp2(TestAddMMOp):
@@ -147,5 +233,23 @@ class TestAddMMOp4(unittest.TestCase):
             assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy())
 
 
+'''
+class TestAddMMAPI(unittest.TestCase):
+    def test_api_error(self):
+        data_x = np.ones((2, 2)).astype(np.float32)
+        data_y = np.ones((2, 2)).astype(np.float32)
+        data_input = np.ones((2, 2)).astype(np.float32)
+
+        paddle.disable_static()
+
+        def test_error1():
+            data_x_wrong = np.ones((2, 3)).astype(np.float32)
+            x = paddle.to_variable(data_x_wrong)
+            y = paddle.to_variable(data_y)
+            input = paddle.to_variable(data_input)
+            out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
+        self.assertRaises(ValueError, test_error1)
+'''
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
index c524fb6930d97c0eb2971d09e751a54628d41325..6157314b1f060577a7d058f0de9a42f6368947ff 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
@@ -63,7 +63,7 @@ class TestAffineChannelOp(OpTest):
         self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias']))
 
     def init_test_case(self):
-        self.shape = [2, 100, 12, 12]
+        self.shape = [2, 100, 3, 3]
         self.C = 100
         self.layout = 'NCHW'
 
@@ -102,7 +102,7 @@ class TestAffineChannelOpError(unittest.TestCase):
 
 class TestAffineChannelNHWC(TestAffineChannelOp):
     def init_test_case(self):
-        self.shape = [2, 12, 12, 100]
+        self.shape = [2, 3, 3, 100]
         self.C = 100
         self.layout = 'NHWC'
 
@@ -115,7 +115,7 @@ class TestAffineChannelNHWC(TestAffineChannelOp):
 
 class TestAffineChannel2D(TestAffineChannelOp):
     def init_test_case(self):
-        self.shape = [8, 100]
+        self.shape = [2, 100]
         self.C = 100
         self.layout = 'NCHW'
 
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..c874cf197ea88c7f12b9b24223d40d22be268b10
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import unittest
+
+
+class AffineGridTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 theta_shape=(20, 2, 3),
+                 output_shape=[20, 2, 5, 7],
+                 align_corners=True,
+                 dtype="float32",
+                 invalid_theta=False,
+                 variable_output_shape=False):
+        super(AffineGridTestCase, self).__init__(methodName)
+
+        self.theta_shape = theta_shape
+        self.output_shape = output_shape
+        self.align_corners = align_corners
+        self.dtype = dtype
+        self.invalid_theta = invalid_theta
+        self.variable_output_shape = variable_output_shape
+
+    def setUp(self):
+        self.theta = np.random.randn(*(self.theta_shape)).astype(self.dtype)
+
+    def fluid_layer(self, place):
+        # align_corners = True
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                theta_var = fluid.data(
+                    "input", self.theta_shape, dtype=self.dtype)
+                y_var = fluid.layers.affine_grid(theta_var, self.output_shape)
+        feed_dict = {"input": self.theta}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                theta_var = fluid.data(
+                    "input", self.theta_shape, dtype=self.dtype)
+                y_var = F.affine_grid(
+                    theta_var,
+                    self.output_shape,
+                    align_corners=self.align_corners)
+        feed_dict = {"input": self.theta}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def paddle_dygraph_layer(self):
+        theta_var = dg.to_variable(
+            self.theta) if not self.invalid_theta else "invalid"
+        output_shape = dg.to_variable(
+            self.
+            output_shape) if self.variable_output_shape else self.output_shape
+        y_var = F.affine_grid(
+            theta_var, output_shape, align_corners=self.align_corners)
+        y_np = y_var.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        place = fluid.CPUPlace()
+        result1 = self.fluid_layer(place)
+        result2 = self.functional(place)
+        with dg.guard(place):
+            result3 = self.paddle_dygraph_layer()
+        if self.align_corners:
+            np.testing.assert_array_almost_equal(result1, result2)
+        np.testing.assert_array_almost_equal(result2, result3)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class AffineGridErrorTestCase(AffineGridTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.paddle_dygraph_layer()
+
+
+def add_cases(suite):
+    suite.addTest(AffineGridTestCase(methodName='runTest'))
+    suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=True))
+
+    suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=False))
+    suite.addTest(
+        AffineGridTestCase(
+            methodName='runTest', variable_output_shape=True))
+
+    suite.addTest(
+        AffineGridTestCase(
+            methodName='runTest',
+            theta_shape=(20, 2, 3),
+            output_shape=[20, 1, 7, 7],
+            align_corners=True))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        AffineGridErrorTestCase(
+            methodName='runTest', output_shape="not_valid"))
+    suite.addTest(
+        AffineGridErrorTestCase(
+            methodName='runTest',
+            invalid_theta=True))  # to test theta not variable error checking
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
index 3668c4f4aa174e34dcc96d40ddae7b359c1bee18..55612d71a17a7ae9801535bf5a35c83b100aab30 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
@@ -17,14 +17,20 @@ import numpy as np
 from op_test import OpTest
 
 
-def AffineGrid(theta, size):
+def AffineGrid(theta, size, align_corners):
     n = size[0]
     w = size[3]
     h = size[2]
+    h_factor = w_factor = 1
+    if not align_corners:
+        h_factor = (h - 1) / float(h)
+        w_factor = (w - 1) / float(w)
     h_idx = np.repeat(
-        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
+        np.linspace(-1, 1, h)[np.newaxis, :], w,
+        axis=0).T[:, :, np.newaxis] * h_factor
     w_idx = np.repeat(
-        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
+        np.linspace(-1, 1, w)[np.newaxis, :], h,
+        axis=0)[:, :, np.newaxis] * w_factor
     grid = np.concatenate(
         [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
     grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
@@ -45,12 +51,17 @@ class TestAffineGridOp(OpTest):
         theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
         theta = np.ones(self.theta_shape).astype("float32")
         self.inputs = {'Theta': theta}
-        self.attrs = {"use_cudnn": True}
+        self.attrs = {
+            "use_cudnn": self.use_cudnn,
+            "align_corners": self.align_corners
+        }
         if self.dynamic_shape:
             self.inputs['OutputShape'] = self.output_shape
         else:
             self.attrs['output_shape'] = self.output_shape
-        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
+        self.outputs = {
+            'Output': AffineGrid(theta, self.output_shape, self.align_corners)
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -62,6 +73,8 @@ class TestAffineGridOp(OpTest):
         self.theta_shape = (17, 2, 3)
         self.output_shape = np.array([17, 2, 5, 7]).astype("int32")
         self.dynamic_shape = False
+        self.use_cudnn = False
+        self.align_corners = True
 
 
 class TestAffineGridOpCase1(TestAffineGridOp):
@@ -69,6 +82,35 @@ class TestAffineGridOpCase1(TestAffineGridOp):
         self.theta_shape = (20, 2, 3)
         self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
         self.dynamic_shape = True
+        self.use_cudnn = True
+        self.align_corners = True
+
+
+class TestAffineGridOpCase2(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (20, 2, 3)
+        self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+        self.use_cudnn = False
+        self.align_corners = True
+
+
+class TestAffineGridOpCase3(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (20, 2, 3)
+        self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+        self.use_cudnn = False
+        self.align_corners = False
+
+
+class TestAffineGridOpCase4(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (25, 2, 3)
+        self.output_shape = np.array([25, 2, 5, 6]).astype("int32")
+        self.dynamic_shape = False
+        self.use_cudnn = False
+        self.align_corners = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_allclose_op.py b/python/paddle/fluid/tests/unittests/test_allclose_op.py
index 5b5ed2641880ade671434185414fa45c26901a2d..dc50e569f80433a5730b1ea33a6f3b4922d99c91 100644
--- a/python/paddle/fluid/tests/unittests/test_allclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_allclose_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 
 
 class TestAllcloseOp(OpTest):
@@ -76,5 +77,58 @@ class TestAllcloseOpNanTrue(TestAllcloseOp):
         self.equal_nan = True
 
 
+class TestAllcloseDygraph(unittest.TestCase):
+    def test_api_case(self):
+        paddle.disable_static()
+        x_data = np.random.rand(10, 10)
+        y_data = np.random.rand(10, 10)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.allclose(x, y, rtol=1e-05, atol=1e-08)
+        expected_out = np.allclose(x_data, y_data, rtol=1e-05, atol=1e-08)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        paddle.enable_static()
+
+
+class TestAllcloseError(unittest.TestCase):
+    def test_input_dtype(self):
+        def test_x_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float16')
+                y = paddle.data(name='y', shape=[10, 10], dtype='float64')
+                result = paddle.allclose(x, y)
+
+        self.assertRaises(TypeError, test_x_dtype)
+
+        def test_y_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                y = paddle.data(name='y', shape=[10, 10], dtype='int32')
+                result = paddle.allclose(x, y)
+
+        self.assertRaises(TypeError, test_y_dtype)
+
+    def test_attr(self):
+        x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+        y = paddle.data(name='y', shape=[10, 10], dtype='float64')
+
+        def test_rtol():
+            result = paddle.allclose(x, y, rtol=True)
+
+        self.assertRaises(TypeError, test_rtol)
+
+        def test_atol():
+            result = paddle.allclose(x, y, rtol=True)
+
+        self.assertRaises(TypeError, test_atol)
+
+        def test_equal_nan():
+            result = paddle.allclose(x, y, equal_nan=1)
+
+        self.assertRaises(TypeError, test_equal_nan)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py
index 1736e49f3b67b380b88e53ac9876f3ccde53104c..29003d28e441c02e040a8d6cb9888e376521bc72 100644
--- a/python/paddle/fluid/tests/unittests/test_arange.py
+++ b/python/paddle/fluid/tests/unittests/test_arange.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import paddle
 from paddle.fluid import core
-from paddle import program_guard, Program
+from paddle.static import program_guard, Program
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -82,7 +82,7 @@ class TestArangeAPI(unittest.TestCase):
 
             place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             out = exe.run(fetch_list=[x1])
 
         expected_data = np.arange(0, 5, 1).astype(np.float32)
@@ -93,15 +93,16 @@ class TestArangeImperative(unittest.TestCase):
     def test_out(self):
         place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
-        with paddle.imperative.guard(place):
-            x1 = paddle.arange(0, 5, 1)
-            x2 = paddle.tensor.arange(5)
-            x3 = paddle.tensor.creation.arange(5)
-
-            start = paddle.imperative.to_variable(np.array([0], 'float32'))
-            end = paddle.imperative.to_variable(np.array([5], 'float32'))
-            step = paddle.imperative.to_variable(np.array([1], 'float32'))
-            x4 = paddle.arange(start, end, step, 'int64')
+        paddle.disable_static(place)
+        x1 = paddle.arange(0, 5, 1)
+        x2 = paddle.tensor.arange(5)
+        x3 = paddle.tensor.creation.arange(5)
+
+        start = paddle.to_variable(np.array([0], 'float32'))
+        end = paddle.to_variable(np.array([5], 'float32'))
+        step = paddle.to_variable(np.array([1], 'float32'))
+        x4 = paddle.arange(start, end, step, 'int64')
+        paddle.enable_static()
 
         expected_data = np.arange(0, 5, 1).astype(np.int64)
         for i in [x1, x2, x3, x4]:
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
index 0201f0635a5afeb285cdbca3e8d526a1ff5032f2..3639c4dea0a3a12aa46d2875affeebd4c623a4dd 100644
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
@@ -201,107 +201,5 @@ class BaseTestComplex2_2(OpTest):
             }
 
 
-class APT_ArgMaxTest(unittest.TestCase):
-    def test_output_result(self):
-        with fluid.program_guard(fluid.Program()):
-            data1 = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            data2 = fluid.data(name="Y", shape=[3], dtype="int64")
-            out = paddle.argmax(input=data1, out=data2)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result = exe.run(
-                feed={"X": np.random.rand(3, 4).astype("float32")},
-                fetch_list=[data2, out])
-            self.assertEqual((result[0] == result[1]).all(), True)
-
-    def test_basic(self):
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            out = paddle.argmax(input=data)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            np_input = np.random.rand(3, 4).astype("float32")
-            expected_result = np.argmax(np_input, axis=1)
-
-            result, = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            out = paddle.argmax(input=data, axis=0)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            np_input = np.random.rand(3, 4).astype("float32")
-            expected_result = np.argmax(np_input, axis=0)
-
-            result = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            out = paddle.argmax(input=data, dtype="int32")
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            np_input = np.random.rand(3, 4).astype("float32")
-            expected_result = np.argmax(np_input, axis=1).astype(np.int32)
-
-            result = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data1 = fluid.data(name="X", shape=[3, 4], dtype="float32")
-            data2 = fluid.data(name="Y", shape=[3], dtype="int64")
-            out = paddle.argmax(input=data, out=data2)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result = exe.run(
-                feed={"X": np.random.rand(3, 4).astype("float32")},
-                fetch_list=[data2, out])
-        self.assertEqual((result[0] == result[1]).all(), True)
-
-    def test_name(self):
-        with fluid.program_guard(fluid.Program()):
-            x = fluid.data(name="x", shape=[100], dtype="float32")
-            y_1 = paddle.argmax(x, name='arg_max_res')
-            self.assertEqual(('arg_max_res' in y_1.name), True)
-
-    def test_errors(self):
-        def test_dtype1():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                paddle.argmax(data, dtype="float32")
-
-        self.assertRaises(TypeError, test_dtype1)
-
-        def test_dtype2():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float64")
-                paddle.argmax(data, dtype="float32")
-
-        self.assertRaises(TypeError, test_dtype2)
-
-
-class TestArgMinMaxOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-
-            def test_argmax_x_type():
-                x1 = [1, 2, 3]
-                output = fluid.layers.argmax(x=x1)
-
-            self.assertRaises(TypeError, test_argmax_x_type)
-
-            def test_argmin_x_type():
-                x2 = [1, 2, 3]
-                output = fluid.layers.argmin(x=x2)
-
-            self.assertRaises(TypeError, test_argmin_x_type)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c1f9d802c31ac2c3b244541936ba25018e1487a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, program_guard
+
+
+def create_kernel_case(op_type, numpy_op_type):
+    class ArgMinMaxKernelBaseCase(OpTest):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = 0
+
+        def setUp(self):
+            np.random.seed(123)
+            self.initTestCase()
+            self.dims = (4, 5, 6)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {'Out': self.numpy_op(self.x, axis=self.axis)}
+
+        def test_check_output(self):
+            paddle.enable_static()
+            self.check_output()
+
+    class ArgMinMaxKernelCase0(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = 1
+
+    class ArgMinMaxKernelCase1(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = 2
+
+    class ArgMinMaxKernelCase2(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = -1
+
+    class ArgMinMaxKernelCase3(ArgMinMaxKernelBaseCase):
+        def initTestCase(self):
+            self.op_type = op_type
+            self.numpy_op_type = numpy_op_type
+            self.axis = -2
+
+    class ArgMinMaxKernelCase4(ArgMinMaxKernelBaseCase):
+        def setUp(self):
+            self.initTestCase()
+            self.dims = (4, 5, 6)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis, "keepdims": True}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {
+                'Out': self.numpy_op(
+                    self.x, axis=self.axis).reshape((1, 5, 6))
+            }
+
+    class ArgMinMaxKernelCase5(ArgMinMaxKernelBaseCase):
+        def setUp(self):
+            self.initTestCase()
+            self.dims = (4)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis, "flatten": True}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {
+                'Out': self.numpy_op(
+                    self.x.flatten(), axis=self.axis)
+            }
+
+    class ArgMinMaxKernelCase6(ArgMinMaxKernelBaseCase):
+        def setUp(self):
+            self.initTestCase()
+            self.dims = (4)
+            self.dtype = "float64"
+            self.x = (1000 * np.random.random(self.dims).astype(self.dtype))
+            self.inputs = {'X': self.x}
+            self.attrs = {"axis": self.axis, "flatten": True, "keepdims": True}
+            self.numpy_op = eval("np.%s" % (numpy_op_type))
+            self.outputs = {
+                'Out':
+                np.array(self.numpy_op(
+                    self.x.flatten(), axis=self.axis))
+            }
+
+    cls_name = "ArgMinMaxKernelBaseCase_%s" % (op_type)
+    ArgMinMaxKernelBaseCase.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelBaseCase
+
+    cls_name = "ArgMinMaxKernelCase0_%s" % (op_type)
+    ArgMinMaxKernelCase0.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase0
+
+    cls_name = "ArgMinMaxKernelCase1_%s" % (op_type)
+    ArgMinMaxKernelCase1.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase1
+
+    cls_name = "ArgMinMaxKernelCase2_%s" % (op_type)
+    ArgMinMaxKernelCase2.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase2
+
+    cls_name = "ArgMinMaxKernelCase3_%s" % (op_type)
+    ArgMinMaxKernelCase3.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase3
+
+    cls_name = "ArgMinMaxKernelCase4_%s" % (op_type)
+    ArgMinMaxKernelCase4.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase4
+
+    cls_name = "ArgMinMaxKernelCase5_%s" % (op_type)
+    ArgMinMaxKernelCase5.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase5
+
+    cls_name = "ArgMinMaxKernelCase6_%s" % (op_type)
+    ArgMinMaxKernelCase6.__name__ = cls_name
+    globals()[cls_name] = ArgMinMaxKernelCase6
+
+
+for op_type, numpy_op_type in zip(['arg_max', 'arg_min'], ['argmax', 'argmin']):
+    create_kernel_case(op_type, numpy_op_type)
+
+
+def create_test_case(op_type):
+    class ArgMaxMinTestCase(unittest.TestCase):
+        def setUp(self):
+            np.random.seed(123)
+            self.input_data = np.random.rand(10, 10).astype("float32")
+            self.places = []
+            self.places.append(fluid.CPUPlace())
+            if core.is_compiled_with_cuda():
+                self.places.append(paddle.CUDAPlace(0))
+            self.op = eval("paddle.%s" % (op_type))
+            self.numpy_op = eval("np.%s" % (op_type))
+
+        def run_static(self, place):
+            paddle.enable_static()
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(self.input_data)
+                self.assertTrue((result_data == np.array(expected_data)).all(),
+                                True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var, axis=1)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(self.input_data, axis=1)
+                self.assertTrue((result_data == expected_data).all(), True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var, axis=-1)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(self.input_data, axis=-1)
+                self.assertTrue((result_data == expected_data).all(), True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+
+                op = eval("paddle.%s" % (op_type))
+                result = op(data_var, axis=-1, keepdim=True)
+                exe = paddle.static.Executor(place)
+                result_data = exe.run(feed={"data": self.input_data},
+                                      fetch_list=[result])
+                expected_data = self.numpy_op(
+                    self.input_data, axis=-1).reshape((10, 1))
+                self.assertTrue((result_data == expected_data).all(), True)
+
+            with paddle.static.program_guard(paddle.static.Program()):
+                op = eval("paddle.%s" % (op_type))
+                data_var = paddle.static.data(
+                    name="data", shape=[10, 10], dtype="float32")
+                result = op(data_var, axis=-1, name="test_arg_api")
+                self.assertTrue("test_arg_api" in result.name)
+
+        def run_dygraph(self, place):
+            paddle.disable_static()
+            op = eval("paddle.%s" % (op_type))
+            data_tensor = paddle.to_tensor(self.input_data)
+
+            #case 1 
+            result_data = op(data_tensor)
+            excepted_data = self.numpy_op(self.input_data)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 2 
+            result_data = op(data_tensor, axis=1)
+            excepted_data = self.numpy_op(self.input_data, axis=1)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 3 
+            result_data = op(data_tensor, axis=-1)
+            excepted_data = self.numpy_op(self.input_data, axis=-1)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 4 
+            result_data = op(data_tensor, axis=-1, keepdim=True)
+            excepted_data = self.numpy_op(self.input_data, axis=-1)
+            excepted_data = excepted_data.reshape((10))
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            #case 5 
+            result_data = op(data_tensor, axis=-1, keepdim=True, dtype="int32")
+            self.assertTrue(result_data.numpy().dtype == np.int32)
+
+            # case for dim 4, 5, 6, for test case coverage
+            input_data = np.random.rand(5, 5, 5, 5)
+            excepted_data = self.numpy_op(input_data, axis=0)
+            result_data = op(paddle.to_tensor(input_data), axis=0)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            input_data = np.random.rand(4, 4, 4, 4, 4)
+            excepted_data = self.numpy_op(input_data, axis=0)
+            result_data = op(paddle.to_tensor(input_data), axis=0)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+            input_data = np.random.rand(3, 3, 3, 3, 3, 3)
+            excepted_data = self.numpy_op(input_data, axis=0)
+            result_data = op(paddle.to_tensor(input_data), axis=0)
+            self.assertTrue((result_data.numpy() == excepted_data).all(), True)
+
+        def test_case(self):
+            for place in self.places:
+                self.run_static(place)
+                self.run_dygraph(place)
+
+    cls_name = "ArgMaxMinTestCase_{}".format(op_type)
+    ArgMaxMinTestCase.__name__ = cls_name
+    globals()[cls_name] = ArgMaxMinTestCase
+
+
+for op_type in ['argmin', 'argmax']:
+    create_test_case(op_type)
+
+
+class TestArgMinMaxOpError(unittest.TestCase):
+    def test_errors(self):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+
+            def test_argmax_x_type():
+                x1 = [1, 2, 3]
+                output = paddle.argmax(x=x1)
+
+            self.assertRaises(TypeError, test_argmax_x_type)
+
+            def test_argmin_x_type():
+                x2 = [1, 2, 3]
+                output = paddle.argmin(x=x2)
+
+            self.assertRaises(TypeError, test_argmin_x_type)
+
+            def test_argmax_attr_type():
+                data = paddle.static.data(
+                    name="test_argmax", shape=[10], dtype="float32")
+                output = paddle.argmax(x=data, dtype="float32")
+
+            self.assertRaises(ValueError, test_argmax_attr_type)
+
+            def test_argmin_attr_type():
+                data = paddle.static.data(
+                    name="test_argmax", shape=[10], dtype="float32")
+                output = paddle.argmin(x=data, dtype="float32")
+
+            self.assertRaises(ValueError, test_argmin_attr_type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
index eb19c8fd6b45cab65e9c9bced189478098bdb66c..2a8e0e6c7f0bcf4a779b4c098cd4af816e976205 100644
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
@@ -17,7 +17,6 @@ from __future__ import print_function
 import unittest
 import paddle
 import paddle.fluid as fluid
-import paddle.imperative as imperative
 import paddle.fluid.layers as layers
 import numpy as np
 import six
@@ -384,20 +383,21 @@ class TestArgsortDygraph(unittest.TestCase):
             self.place = core.CPUPlace()
 
     def test_api_0(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.argsort(var_x)
-            self.assertEqual((np.argsort(self.input_data) == out.numpy()).all(),
-                             True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.argsort(var_x)
+        self.assertEqual((np.argsort(self.input_data) == out.numpy()).all(),
+                         True)
+        paddle.enable_static()
 
     def test_api_1(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.argsort(var_x, axis=-1)
-            self.assertEqual(
-                (np.argsort(
-                    self.input_data, axis=-1) == out.numpy()).all(),
-                True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.argsort(var_x, axis=-1)
+        self.assertEqual(
+            (np.argsort(
+                self.input_data, axis=-1) == out.numpy()).all(), True)
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd009db5fd00133c5bad7c8c52662002ebd03fa8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
@@ -0,0 +1,278 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+
+logger = get_logger()
+
+
+class AutoCheckPointACLBase(AutoCheckpointBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def tearDown(self):
+        os.environ.clear()
+        os.environ.update(self._old_environ)
+
+        file_name = os.path.basename(__file__)
+        base_name = os.path.splitext(file_name)[0]
+        print("runnng name:", base_name)
+
+    def _run_normal(self):
+        exe, main_prog, startup_prog = self._generate()
+
+        save_dir = "./run_save_model"
+        fs = LocalFS()
+
+        fs.delete(save_dir)
+        logger.info("begin _run_normal")
+
+        compiled, data_loader, optimizer, loss, image, label = self._init_env(
+            exe, main_prog, startup_prog)
+        for i in range(3):
+            self.assertEqual(acp._get_train_epoch_range(), None)
+            self.assertEqual(acp.g_acp_type, None)
+            for data in data_loader():
+                self.assertEqual(acp.g_acp_type, None)
+                self.assertEqual(acp._get_train_epoch_range(), None)
+                fetch = exe.run(compiled, feed=data, fetch_list=[loss])
+
+        self.assertEqual(acp.g_acp_type, None)
+        self.assertEqual(acp._get_train_epoch_range(), None)
+
+        m1 = PaddleModel(exe, compiled)
+        m1.serialize(save_dir)
+
+        m2 = PaddleModel(exe, compiled)
+        m2.deserialize(save_dir)
+
+        logger.info("end _run_normal")
+        fs.delete(save_dir)
+
+    def _not_use_train(self):
+        logger.info("begin _not_use_train")
+        exe, main_prog, startup_prog = self._generate()
+
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog)
+
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            epochs.append(i)
+            for data in data_loader():
+                fetch = exe.run(compiled, feed=data, fetch_list=[loss])
+
+        self.assertEqual(epochs, [0, 1, 2])
+        logger.info("end _not_use_train")
+
+    def _run_save_0(self, break_epoch_no=None):
+        logger.info("begin _run_save_0")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        exe, main_prog, startup_prog = self._generate()
+
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog)
+
+        o = None
+        i = 0
+        name = None
+        for i in acp.train_epoch_range(3, 0):
+            o = acp._get_train_epoch_range()
+            name = o.name
+
+            for data in data_loader():
+                fetch = exe.run(compiled, feed=data, fetch_list=[loss])
+
+            self.assertEqual(len(o._exe_status), 1)
+
+            if break_epoch_no is not None:
+                if i == break_epoch_no:
+                    break
+
+        o = acp._get_train_epoch_range()
+        assert o == None, "now train epoch must not exits now"
+        if break_epoch_no is None:
+            self.assertEqual(i, 2)
+        else:
+            self.assertEqual(i, break_epoch_no)
+
+        fs.delete(save_dir)
+        logger.info("end _run_save_0")
+
+    def _run_load_0(self, break_epoch_no=None):
+        logger.info("begin _run_load_0")
+        exe, main_prog, startup_prog = self._generate()
+
+        fs = LocalFS()
+        save_dir = "./run_load_0"
+        fs.delete(save_dir)
+
+        compiled, data_loader, optimizer, loss, image, label = self._init_env(
+            exe, main_prog, startup_prog)
+
+        o = None
+        i = 0
+        check = False
+
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            epochs.append(i)
+
+            for data in data_loader():
+                fetch = exe.run(compiled, feed=data, fetch_list=[loss])
+
+        o = acp._get_train_epoch_range()
+        self.assertTrue(o == None, "now train epoch must not exits now")
+        self.assertEqual(i, 2)
+
+        if break_epoch_no is not None:
+            if break_epoch_no == 0:
+                self.assertEqual(epochs, [0, 1, 2])
+            elif break_epoch_no == 1:
+                self.assertEqual(epochs, [1, 2])
+            elif break_epoch_no == 2:
+                self.assertEqual(epochs, [2])
+        else:
+            self.assertEqual(epochs, [2])
+
+        fs.delete(save_dir)
+        logger.info("begin _run_load_0")
+
+    def _test_corner_epoch_no(self, break_epoch_no):
+        logger.info("begin test_corener_epoch_no")
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+        self._run_save_0(break_epoch_no=break_epoch_no)
+        self._reset_generator()
+        self._run_load_0(break_epoch_no=break_epoch_no)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        logger.info("end test_corener_epoch_no")
+
+
+class AutoCheckpointTest(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_0",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_0",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_0",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_normal(self):
+        logger.info("begin test_normal")
+        checker = acp._get_checker()
+
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._clear_envs()
+        self._reset_generator()
+        self._run_normal()
+        self._readd_envs()
+        logger.info("end test_normal")
+
+    def test_basic(self):
+        logger.info("begin test_basic")
+        checker = acp._get_checker()
+        self.assertEqual(checker.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
+        self.assertEqual(checker.platform, "PADDLE_CLOUD")
+        self.assertEqual(checker.save_checkpoint_inter, 0)
+        print(checker)
+
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+        self._run_save_0()
+
+        self._reset_generator()
+        self._run_load_0()
+
+        logger.info("end test_basic")
+
+    def test_not_use(self):
+        logger.info("begin test_not_use")
+
+        self._clear_envs()
+        self._reset_generator()
+        self._not_use_train()
+        self._readd_envs()
+
+        logger.info("end test_not_use")
+
+    def test_checker(self):
+        os.environ.pop("PADDLE_JOB_ID", None)
+        try:
+            checker = AutoCheckpointChecker()
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+        os.environ["PADDLE_JOB_ID"] = "test_job_auto_1"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
new file mode 100644
index 0000000000000000000000000000000000000000..55173325f621f7333a7c3ca32a9c55becee72e5a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTest1(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_1",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_corner_epoch_no(self):
+        self._test_corner_epoch_no(0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d72fa01008af55a83d7b9a19747a8d96fb74b2b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTest2(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_2",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_2",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_2",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_corner_epoch_no(self):
+        self._test_corner_epoch_no(1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5382f7e328ed1afa2d7516cd0d8db2db659aadd7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTest3(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_3",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_3",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_3",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_corner_epoch_no(self):
+        self._test_corner_epoch_no(2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..90db9595d92ef602c03fa7dd104484a4f6101a87
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTestDist(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_dist_basic",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_dist_basic",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_dist_basic",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_distributed_basic(self):
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+
+        logger.info("begin test_distributed_basic")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        #basic
+        exe, main_prog, startup_prog = self._generate()
+
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog, minimize=False)
+
+        #fleet
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with fluid.program_guard(main_prog, startup_prog):
+            dist_optimizer = fleet.distributed_optimizer(optimizer)
+            dist_optimizer.minimize(loss)
+
+        exe.run(startup_prog)
+
+        o = None
+        i = 0
+        name = None
+        for i in acp.train_epoch_range(3, 0):
+            o = acp._get_train_epoch_range()
+            name = o.name
+            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))
+
+            for data in data_loader():
+                fetch = exe.run(fleet.main_program,
+                                feed=data,
+                                fetch_list=[loss])
+
+            self.assertEqual(len(o._exe_status), 1)
+
+        o = acp._get_train_epoch_range()
+        assert o == None, "now train epoch must not exits now"
+        self.assertEqual(i, 2)
+
+        fs.delete(save_dir)
+
+        logger.info("end test_distributed_basic")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c10cd0e9922859bf3bad2015587fc0a6b2ba5da
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTestMul(AutoCheckPointACLBase):
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_dist_multiple",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_dist_multiple",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_dist_multiple",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_multiple(self):
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+
+        logger.info("begin test_multiple")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        exe, main_prog1, startup_prog1 = self._generate()
+        _, main_prog2, startup_prog2 = self._generate()
+
+        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
+            self._init_env(exe, main_prog1, startup_prog1)
+
+        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
+            self._init_env(exe, main_prog2, startup_prog2)
+
+        o = None
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            for data in data_loader1():
+                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])
+
+            for data in data_loader2():
+                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])
+
+            o = acp._get_train_epoch_range()
+            self.assertEqual(len(o._exe_status), 2)
+            print(o._exe_status)
+            epochs.append(i)
+
+        o = acp._get_train_epoch_range()
+        self.assertTrue(o == None, "now train epoch must not exits now")
+        self.assertEqual(i, 2)
+        self.assertEqual(epochs, [0, 1, 2])
+
+        fs.delete(save_dir)
+        logger.info("end test_multiple")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index bc666c0de5be06be7529bced39071303430c8ace..875f6211a7fbd98463d98dff91d93cc1b431fc86 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -86,6 +86,31 @@ class TestBaseLayer(unittest.TestCase):
             ret = l()
             self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2])))
 
+    def test_add_parameter_with_error(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            param = net.create_parameter(shape=[1])
+
+            with self.assertRaises(TypeError):
+                net.add_parameter(10, param)
+
+            with self.assertRaises(KeyError):
+                net.add_parameter("param.name", param)
+
+            with self.assertRaises(KeyError):
+                net.add_parameter("", param)
+
+            with self.assertRaises(KeyError):
+                net.test_param = 10
+                net.add_parameter("test_param", param)
+
+            with self.assertRaises(TypeError):
+                net.add_parameter("no_param", 10)
+
+            load_param = net.create_parameter(shape=[1])
+            net._loaddict_holder[load_param.name] = load_param
+            net.add_parameter("load_param", load_param)
+
 
 class BufferLayer(fluid.Layer):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d3c6e7d0492b2f4a98a595f015e3b9f4a19e24
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestBatchNorm(unittest.TestCase):
+    def test_name(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            with fluid.dygraph.guard(p):
+                batch_norm1d = paddle.nn.BatchNorm1d(1, name="test")
+
+    def test_error(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            #paddle.disable_static()
+            x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+            x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
+
+            def error1d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                batch_norm1d = paddle.nn.BatchNorm1d(1)
+                batch_norm1d(fluid.dygraph.to_variable(x_data_4))
+
+            def error2d():
+                x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
+                batch_norm2d = paddle.nn.BatchNorm2d(1)
+                batch_norm2d(fluid.dygraph.to_variable(x_data_3))
+
+            def error3d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                batch_norm3d = paddle.nn.BatchNorm3d(1)
+                batch_norm3d(fluid.dygraph.to_variable(x_data_4))
+
+            with fluid.dygraph.guard(p):
+                self.assertRaises(ValueError, error1d)
+                self.assertRaises(ValueError, error2d)
+                self.assertRaises(ValueError, error3d)
+
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x, is_test, trainable_statistics):
+                with fluid.dygraph.guard(p):
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1],
+                        is_test=is_test,
+                        trainable_statistics=trainable_statistics)
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    bn = paddle.nn.BatchNorm2d(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x, False, False)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np, is_test, trainable_statistics):
+                with program_guard(Program(), Program()):
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1],
+                        is_test=is_test,
+                        trainable_statistics=trainable_statistics)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = bn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    bn = paddle.nn.BatchNorm2d(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = bn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x, False, False)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
index 7d90bbd0357bcc93cf7a66e99082feeb7e254db4..6ec6fdb59f200ce1dc9b6418b7f11329f85ba5dd 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
@@ -17,7 +17,7 @@ from __future__ import division
 import unittest
 
 import paddle.fluid as fluid
-from paddle.io import BatchSampler, Dataset
+from paddle.io import BatchSampler, Dataset, Sampler, SequenceSampler, RandomSampler
 
 
 class RandomDataset(Dataset):
@@ -35,6 +35,72 @@ class RandomDataset(Dataset):
         return self.sample_num
 
 
+class TestSampler(unittest.TestCase):
+    def test_main(self):
+        dataset = RandomDataset(100, 10)
+        sampler = Sampler(dataset)
+        try:
+            iter(sampler)
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+
+class TestSequenceSampler(unittest.TestCase):
+    def test_main(self):
+        dataset = RandomDataset(100, 10)
+        sampler = SequenceSampler(dataset)
+        assert len(sampler) == 100
+
+        for i, index in enumerate(iter(sampler)):
+            assert i == index
+
+
+class TestRandomSampler(unittest.TestCase):
+    def test_main(self):
+        dataset = RandomDataset(100, 10)
+        sampler = RandomSampler(dataset)
+        assert len(sampler) == 100
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+        assert tuple(sorted(rets)) == tuple(range(0, 100))
+
+    def test_with_num_samples(self):
+        dataset = RandomDataset(100, 10)
+        sampler = RandomSampler(dataset, num_samples=50, replacement=True)
+        assert len(sampler) == 50
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+            assert i >= 0 and i < 100
+
+    def test_with_generator(self):
+        dataset = RandomDataset(100, 10)
+        generator = iter(range(0, 60))
+        sampler = RandomSampler(dataset, generator=generator)
+        assert len(sampler) == 100
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+        assert tuple(sorted(rets)) == tuple(range(0, 60))
+
+    def test_with_generator_num_samples(self):
+        dataset = RandomDataset(100, 10)
+        generator = iter(range(0, 60))
+        sampler = RandomSampler(
+            dataset, generator=generator, num_samples=50, replacement=True)
+        assert len(sampler) == 50
+
+        rets = []
+        for i in iter(sampler):
+            rets.append(i)
+        assert tuple(sorted(rets)) == tuple(range(0, 50))
+
+
 class TestBatchSampler(unittest.TestCase):
     def setUp(self):
         self.num_samples = 1000
@@ -86,16 +152,18 @@ class TestBatchSamplerShuffle(TestBatchSampler):
         self.drop_last = True
 
 
-class TestBatchSamplerWithIndices(TestBatchSampler):
+class TestBatchSamplerWithSampler(TestBatchSampler):
     def init_batch_sampler(self):
+        dataset = RandomDataset(1000, 10)
+        sampler = SequenceSampler(dataset)
         bs = BatchSampler(
-            indices=list(range(self.num_samples)),
+            sampler=sampler,
             batch_size=self.batch_size,
             drop_last=self.drop_last)
         return bs
 
 
-class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
+class TestBatchSamplerWithSamplerDropLast(unittest.TestCase):
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -103,12 +171,22 @@ class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
         self.shuffle = False
         self.drop_last = True
 
+
+class TestBatchSamplerWithSamplerShuffle(unittest.TestCase):
+    def setUp(self):
+        self.num_samples = 1000
+        self.num_classes = 10
+        self.batch_size = 32
+        self.shuffle = True
+        self.drop_last = True
+
     def test_main(self):
         try:
             dataset = RandomDataset(self.num_samples, self.num_classes)
+            sampler = RandomSampler(dataset)
             bs = BatchSampler(
-                dataset=dataset,
-                indices=list(range(self.num_samples)),
+                sampler=sampler,
+                shuffle=self.shuffle,
                 batch_size=self.batch_size,
                 drop_last=self.drop_last)
             self.assertTrue(False)
diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py
index 21571e0981065a0a1e2a5db03e91b4df0ea55d9a..a8054295b41c1f6d0008c4f0a9fadb6f04c647fc 100644
--- a/python/paddle/fluid/tests/unittests/test_bce_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py
@@ -19,93 +19,189 @@ import unittest
 from op_test import OpTest
 
 
+def test_static_layer(place,
+                      input_np,
+                      label_np,
+                      reduction='mean',
+                      weight_np=None):
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    with paddle.static.program_guard(prog, startup_prog):
+        input = paddle.data(name='input', shape=input_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        if weight_np is not None:
+            weight = paddle.data(
+                name='weight', shape=weight_np.shape, dtype='float64')
+            bce_loss = paddle.nn.loss.BCELoss(
+                weight=weight, reduction=reduction)
+        else:
+            bce_loss = paddle.nn.loss.BCELoss(reduction=reduction)
+        res = bce_loss(input, label)
+        exe = paddle.static.Executor(place)
+        static_result = exe.run(prog,
+                                feed={"input": input_np,
+                                      "label": label_np}
+                                if weight_np is None else {
+                                    "input": input_np,
+                                    "label": label_np,
+                                    "weight": weight_np
+                                },
+                                fetch_list=[res])
+    return static_result
+
+
+def test_static_functional(place,
+                           input_np,
+                           label_np,
+                           reduction='mean',
+                           weight_np=None):
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    with paddle.static.program_guard(prog, startup_prog):
+        input = paddle.data(name='input', shape=input_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        if weight_np is not None:
+            weight = paddle.data(
+                name='weight', shape=weight_np.shape, dtype='float64')
+            res = paddle.nn.functional.binary_cross_entropy(
+                input, label, weight=weight, reduction=reduction)
+        else:
+            res = paddle.nn.functional.binary_cross_entropy(
+                input, label, reduction=reduction)
+        exe = paddle.static.Executor(place)
+        static_result = exe.run(prog,
+                                feed={"input": input_np,
+                                      "label": label_np}
+                                if weight_np is None else {
+                                    "input": input_np,
+                                    "label": label_np,
+                                    "weight": weight_np
+                                },
+                                fetch_list=[res])
+    return static_result
+
+
+def test_dygraph_layer(place,
+                       input_np,
+                       label_np,
+                       reduction='mean',
+                       weight_np=None):
+    paddle.disable_static()
+    if weight_np is not None:
+        weight = paddle.to_tensor(weight_np)
+        bce_loss = paddle.nn.loss.BCELoss(weight=weight, reduction=reduction)
+    else:
+        bce_loss = paddle.nn.loss.BCELoss(reduction=reduction)
+    dy_res = bce_loss(paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+    dy_result = dy_res.numpy()
+    paddle.enable_static()
+    return dy_result
+
+
+def test_dygraph_functional(place,
+                            input_np,
+                            label_np,
+                            reduction='mean',
+                            weight_np=None):
+    paddle.disable_static()
+    input = paddle.to_tensor(input_np)
+    label = paddle.to_tensor(label_np)
+
+    if weight_np is not None:
+        weight = paddle.to_tensor(weight_np)
+        dy_res = paddle.nn.functional.binary_cross_entropy(
+            input, label, weight=weight, reduction=reduction)
+    else:
+        dy_res = paddle.nn.functional.binary_cross_entropy(
+            input, label, reduction=reduction)
+    dy_result = dy_res.numpy()
+    paddle.enable_static()
+    return dy_result
+
+
+def calc_bceloss(input_np, label_np, reduction='mean', weight_np=None):
+    if weight_np is None:
+        expected = -1 * (label_np * np.log(input_np) +
+                         (1. - label_np) * np.log(1. - input_np))
+    else:
+        expected = -1 * weight_np * (label_np * np.log(input_np) +
+                                     (1. - label_np) * np.log(1. - input_np))
+
+    if reduction == 'mean':
+        expected = np.mean(expected)
+    elif reduction == 'sum':
+        expected = np.sum(expected)
+    else:
+        expected = expected
+
+    return expected
+
+
 class TestBCELoss(unittest.TestCase):
     def test_BCELoss(self):
-        input_np = np.random.random(size=(20, 30)).astype(np.float64)
-        label_np = np.random.random(size=(20, 30)).astype(np.float64)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
+        input_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64)
+        label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
         places = [fluid.CPUPlace()]
         if fluid.core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         reductions = ['sum', 'mean', 'none']
         for place in places:
-            for red in reductions:
-                with fluid.program_guard(prog, startup_prog):
-                    input = fluid.data(
-                        name='input', shape=[None, 30], dtype='float64')
-                    label = fluid.data(
-                        name='label', shape=[None, 30], dtype='float64')
-                    bce_loss = paddle.nn.loss.BCELoss(reduction=red)
-                    res = bce_loss(input, label)
-
-                    exe = fluid.Executor(place)
-                    static_result = exe.run(
-                        prog,
-                        feed={"input": input_np,
-                              "label": label_np},
-                        fetch_list=[res])
-
-                with fluid.dygraph.guard():
-                    bce_loss = paddle.nn.loss.BCELoss(reduction=red)
-                    dy_res = bce_loss(
-                        fluid.dygraph.to_variable(input_np),
-                        fluid.dygraph.to_variable(label_np))
-                    dy_result = dy_res.numpy()
-
-                expected = -1 * (label_np * np.log(input_np) +
-                                 (1. - label_np) * np.log(1. - input_np))
-                if red == 'mean':
-                    expected = np.mean(expected)
-                elif red == 'sum':
-                    expected = np.sum(expected)
-                else:
-                    expected = expected
+            for reduction in reductions:
+                static_result = test_static_layer(place, input_np, label_np,
+                                                  reduction)
+                dy_result = test_dygraph_layer(place, input_np, label_np,
+                                               reduction)
+                expected = calc_bceloss(input_np, label_np, reduction)
                 self.assertTrue(np.allclose(static_result, expected))
                 self.assertTrue(np.allclose(static_result, dy_result))
                 self.assertTrue(np.allclose(dy_result, expected))
+                static_functional = test_static_functional(place, input_np,
+                                                           label_np, reduction)
+                dy_functional = test_dygraph_functional(place, input_np,
+                                                        label_np, reduction)
+                self.assertTrue(np.allclose(static_functional, expected))
+                self.assertTrue(np.allclose(static_functional, dy_functional))
+                self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCELoss_weight(self):
-        input_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
-        label_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        input_np = np.random.uniform(
+            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(
+            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
         weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[None, 3, 4, 10], dtype='float64')
-            label = fluid.data(
-                name='label', shape=[None, 3, 4, 10], dtype='float64')
-            weight = fluid.data(
-                name='weight', shape=[3, 4, 10], dtype='float64')
-            bce_loss = paddle.nn.loss.BCELoss(weight=weight)
-            res = bce_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(prog,
-                                    feed={
-                                        "input": input_np,
-                                        "label": label_np,
-                                        "weight": weight_np
-                                    },
-                                    fetch_list=[res])
-
-        with fluid.dygraph.guard():
-            bce_loss = paddle.nn.loss.BCELoss(
-                weight=fluid.dygraph.to_variable(weight_np))
-            dy_res = bce_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_res.numpy()
-
-        expected = np.mean(-1 * weight_np *
-                           (label_np * np.log(input_np) +
-                            (1. - label_np) * np.log(1. - input_np)))
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
+        for reduction in ['sum', 'mean', 'none']:
+            static_result = test_static_layer(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            dy_result = test_dygraph_layer(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            expected = calc_bceloss(
+                input_np, label_np, reduction, weight_np=weight_np)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            static_functional = test_static_functional(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            dy_functional = test_dygraph_functional(
+                place, input_np, label_np, reduction, weight_np=weight_np)
+            self.assertTrue(np.allclose(static_functional, expected))
+            self.assertTrue(np.allclose(static_functional, dy_functional))
+            self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCELoss_error(self):
+        paddle.disable_static()
+        self.assertRaises(
+            ValueError, paddle.nn.loss.BCELoss, reduction="unsupport reduction")
+        input = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
+        label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
+        self.assertRaises(
+            ValueError,
+            paddle.nn.functional.binary_cross_entropy,
+            input=input,
+            label=label,
+            reduction="unsupport reduction")
+        paddle.enable_static()
 
 
 def bce_loss(input, label):
diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ba13a6da01c7dbf8b0e854df43f11b19a4ebd4c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+from op_test import OpTest
+
+
+def call_bce_layer(logit, label, weight=None, reduction='mean',
+                   pos_weight=None):
+    bce_logit_loss = paddle.nn.loss.BCEWithLogitsLoss(
+        weight=weight, reduction=reduction, pos_weight=pos_weight)
+    res = bce_logit_loss(logit, label)
+    return res
+
+
+def call_bce_functional(logit,
+                        label,
+                        weight=None,
+                        reduction='mean',
+                        pos_weight=None):
+    res = paddle.nn.functional.binary_cross_entropy_with_logits(
+        logit, label, weight=weight, reduction=reduction, pos_weight=pos_weight)
+    return res
+
+
+def test_static(place,
+                logit_np,
+                label_np,
+                weight_np=None,
+                reduction='mean',
+                pos_weight_np=None,
+                functional=False):
+    paddle.enable_static()
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    with paddle.static.program_guard(prog, startup_prog):
+        logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        feed_dict = {"logit": logit_np, "label": label_np}
+
+        pos_weight = None
+        weight = None
+        if pos_weight_np is not None:
+            pos_weight = paddle.data(
+                name='pos_weight', shape=pos_weight_np.shape, dtype='float64')
+            feed_dict["pos_weight"] = pos_weight_np
+        if weight_np is not None:
+            weight = paddle.data(
+                name='weight', shape=weight_np.shape, dtype='float64')
+            feed_dict["weight"] = weight_np
+        if functional:
+            res = call_bce_functional(logit, label, weight, reduction,
+                                      pos_weight)
+        else:
+            res = call_bce_layer(logit, label, weight, reduction, pos_weight)
+        exe = paddle.static.Executor(place)
+        static_result = exe.run(prog, feed=feed_dict, fetch_list=[res])
+    return static_result
+
+
+def test_dygraph(place,
+                 logit_np,
+                 label_np,
+                 weight_np=None,
+                 reduction='mean',
+                 pos_weight_np=None,
+                 functional=False):
+    paddle.disable_static()
+    logit = paddle.to_tensor(logit_np)
+    label = paddle.to_tensor(label_np)
+    weight = None
+    pos_weight = None
+    if weight_np is not None:
+        weight = paddle.to_tensor(weight_np)
+    if pos_weight_np is not None:
+        pos_weight = paddle.to_tensor(pos_weight_np)
+    if functional:
+        dy_res = call_bce_functional(logit, label, weight, reduction,
+                                     pos_weight)
+    else:
+        dy_res = call_bce_layer(logit, label, weight, reduction, pos_weight)
+    dy_result = dy_res.numpy()
+    paddle.enable_static()
+    return dy_result
+
+
+def calc_bce_with_logits_loss(logit_np,
+                              label_np,
+                              reduction='mean',
+                              weight_np=None,
+                              pos_weight=None):
+    expected = np.maximum(
+        logit_np,
+        0) - logit_np * label_np + np.log(1 + np.exp(-np.abs(logit_np)))
+    if pos_weight is not None:
+        expected = expected * ((pos_weight - 1) * label_np + 1)
+    if weight_np is not None:
+        expected = weight_np * expected
+
+    if reduction == 'mean':
+        expected = np.mean(expected)
+    elif reduction == 'sum':
+        expected = np.sum(expected)
+    else:
+        expected = expected
+
+    return expected
+
+
+class TestBCEWithLogitsLoss(unittest.TestCase):
+    def test_BCEWithLogitsLoss(self):
+        logit_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64)
+        label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        reductions = ['sum', 'mean', 'none']
+        for place in places:
+            for reduction in reductions:
+                static_result = test_static(
+                    place, logit_np, label_np, reduction=reduction)
+                dy_result = test_dygraph(
+                    place, logit_np, label_np, reduction=reduction)
+                expected = calc_bce_with_logits_loss(logit_np, label_np,
+                                                     reduction)
+                self.assertTrue(np.allclose(static_result, expected))
+                self.assertTrue(np.allclose(static_result, dy_result))
+                self.assertTrue(np.allclose(dy_result, expected))
+                static_functional = test_static(
+                    place,
+                    logit_np,
+                    label_np,
+                    reduction=reduction,
+                    functional=True)
+                dy_functional = test_dygraph(
+                    place,
+                    logit_np,
+                    label_np,
+                    reduction=reduction,
+                    functional=True)
+                self.assertTrue(np.allclose(static_functional, expected))
+                self.assertTrue(np.allclose(static_functional, dy_functional))
+                self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCEWithLogitsLoss_weight(self):
+        logit_np = np.random.uniform(
+            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(
+            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        for reduction in ['sum', 'mean', 'none']:
+            static_result = test_static(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction)
+            dy_result = test_dygraph(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction)
+            expected = calc_bce_with_logits_loss(
+                logit_np, label_np, reduction, weight_np=weight_np)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            static_functional = test_static(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction,
+                functional=True)
+            dy_functional = test_dygraph(
+                place,
+                logit_np,
+                label_np,
+                weight_np=weight_np,
+                reduction=reduction,
+                functional=True)
+            self.assertTrue(np.allclose(static_functional, expected))
+            self.assertTrue(np.allclose(static_functional, dy_functional))
+            self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCEWithLogitsLoss_pos_weight(self):
+        logit_np = np.random.uniform(
+            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(
+            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        pos_weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
+        weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        reduction = "mean"
+        static_result = test_static(place, logit_np, label_np, weight_np,
+                                    reduction, pos_weight_np)
+        dy_result = test_dygraph(place, logit_np, label_np, weight_np,
+                                 reduction, pos_weight_np)
+        expected = calc_bce_with_logits_loss(logit_np, label_np, reduction,
+                                             weight_np, pos_weight_np)
+        self.assertTrue(np.allclose(static_result, expected))
+        self.assertTrue(np.allclose(static_result, dy_result))
+        self.assertTrue(np.allclose(dy_result, expected))
+        static_functional = test_static(
+            place,
+            logit_np,
+            label_np,
+            weight_np,
+            reduction,
+            pos_weight_np,
+            functional=True)
+        dy_functional = test_dygraph(
+            place,
+            logit_np,
+            label_np,
+            weight_np,
+            reduction,
+            pos_weight_np,
+            functional=True)
+        self.assertTrue(np.allclose(static_functional, expected))
+        self.assertTrue(np.allclose(static_functional, dy_functional))
+        self.assertTrue(np.allclose(dy_functional, expected))
+
+    def test_BCEWithLogitsLoss_error(self):
+        paddle.disable_static()
+        self.assertRaises(
+            ValueError,
+            paddle.nn.BCEWithLogitsLoss,
+            reduction="unsupport reduction")
+        logit = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
+        label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
+        self.assertRaises(
+            ValueError,
+            paddle.nn.functional.binary_cross_entropy_with_logits,
+            logit=logit,
+            label=label,
+            reduction="unsupport reduction")
+        paddle.enable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..12a29de80426639ab3a9d2b879bb88a461ba2ab4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -0,0 +1,76 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+from op_test import OpTest
+import numpy as np
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, bins=2)
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.5 * np.ones((2))
+    return hist, prob
+
+
+class TestBernoulliOp(OpTest):
+    def setUp(self):
+        self.op_type = "bernoulli"
+        self.inputs = {"X": np.random.uniform(size=(1000, 784))}
+        self.init_attrs()
+        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
+
+    def init_attrs(self):
+        self.attrs = {}
+        self.output_hist = output_hist
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        hist, prob = self.output_hist(np.array(outs[0]))
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+
+class TestBernoulliApi(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+        x = paddle.rand([1024, 1024])
+        out = paddle.bernoulli(x)
+        paddle.enable_static()
+        hist, prob = output_hist(out.numpy())
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+    def test_static(self):
+        x = paddle.rand([1024, 1024])
+        out = paddle.bernoulli(x)
+        exe = paddle.static.Executor(paddle.CPUPlace())
+        out = exe.run(paddle.static.default_main_program(),
+                      fetch_list=[out.name])
+        hist, prob = output_hist(out[0])
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_api.py b/python/paddle/fluid/tests/unittests/test_bilinear_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eae4797de85f371ed62e78c85b160f698ee9eb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_api.py
@@ -0,0 +1,65 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from op_test import OpTest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+
+
+class TestBilinearAPI(unittest.TestCase):
+    def test_api(self):
+        with fluid.program_guard(fluid.default_startup_program(),
+                                 fluid.default_main_program()):
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+            else:
+                place = core.CPUPlace()
+            exe = fluid.Executor(place)
+
+            data1 = fluid.data(name='X1', shape=[5, 5], dtype='float32')
+            data2 = fluid.data(name='X2', shape=[5, 4], dtype='float32')
+
+            layer1 = np.random.random((5, 5)).astype('float32')
+            layer2 = np.random.random((5, 4)).astype('float32')
+
+            bilinear = paddle.nn.Bilinear(
+                in1_features=5, in2_features=4, out_features=1000)
+            ret = bilinear(data1, data2)
+
+            exe.run(fluid.default_startup_program())
+            ret_fetch = exe.run(feed={'X1': layer1,
+                                      'X2': layer2},
+                                fetch_list=[ret.name])
+            self.assertEqual(ret_fetch[0].shape, (5, 1000))
+
+
+class TestBilinearAPIDygraph(unittest.TestCase):
+    def test_api(self):
+        paddle.disable_static()
+        layer1 = np.random.random((5, 5)).astype('float32')
+        layer2 = np.random.random((5, 4)).astype('float32')
+        bilinear = paddle.nn.Bilinear(
+            in1_features=5, in2_features=4, out_features=1000)
+        ret = bilinear(paddle.to_tensor(layer1), paddle.to_tensor(layer2))
+        self.assertEqual(ret.shape, [5, 1000])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index 5cc8e2ba15d260b988ee66a5711aed42ca04c10b..cc2b1165ec304a63671b48d4702142ea38c9a2c1 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -65,7 +65,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
     """Bipartite Matching algorithm for batch input.
     Arg:
         distance (numpy.array) : The distance of two entries with shape [M, N].
-        lod (list of int): The offsets of each input in this batch.
+        lod (list of int): The length of each input in this batch.
     """
     n = len(lod)
     m = distance.shape[1]
@@ -73,6 +73,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
     match_dist = np.zeros((n, m), dtype=np.float32)
     cur_offset = 0
     for i in range(n):
+        if lod[i] == 0: continue
         bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :],
                         match_indices[i, :], match_dist[i, :])
         if match_type == 'per_prediction':
@@ -155,5 +156,22 @@ class TestBipartiteMatchOpWithPerPredictionType(OpTest):
         self.check_output()
 
 
+class TestBipartiteMatchOpWithEmptyLoD(OpTest):
+    def setUp(self):
+        self.op_type = 'bipartite_match'
+        lod = [[5, 6, 0, 12]]
+        dist = np.random.random((23, 217)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
+
+        self.inputs = {'DistMat': (dist, lod)}
+        self.outputs = {
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_dist,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bmm_op.py b/python/paddle/fluid/tests/unittests/test_bmm_op.py
index 993ac25d8d4b638a56c9e2aa4f832f576f0b2ae7..cb1b3ded53472c022ef83539f573c9e6c192a966 100644
--- a/python/paddle/fluid/tests/unittests/test_bmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bmm_op.py
@@ -73,5 +73,15 @@ class API_TestDygraphBmm(unittest.TestCase):
         self.assertTrue(np.allclose(expected_result, out_np))
 
 
+class TestBmmAPIError(unittest.TestCase):
+    def test_api_error(self):
+        x_data = np.arange(24, dtype='float32').reshape((2, 3, 4))
+        y_data = np.arange(16, dtype='float32').reshape((2, 4, 2))
+        y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4))
+        y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2))
+        self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1)
+        self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a9d450e223f1e0bc84513f7069e0a104fa644e7c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+# use default values
+# FIXME: random fails on Unknown command lines -c (or -m).
+launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad75f2aa8bc06d2af24f5d22eb126a3558cd6f74
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+from paddle.fluid.incubate.checkpoint.auto_checkpoint import ExeTrainStatus
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
+
+
+class CheckpointerSaverTest(unittest.TestCase):
+    def test(self):
+        fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
+        dir_path = "./checkpointsaver_test"
+        fs.delete(dir_path)
+
+        s = CheckpointSaver(fs)
+
+        fs.mkdirs("{}/exe.exe".format(dir_path))
+        fs.mkdirs("{}/exe.1".format(dir_path))
+        fs.mkdirs("{}/exe".format(dir_path))
+
+        a = s.get_checkpoint_no(dir_path)
+        self.assertEqual(len(a), 0)
+
+        fs.mkdirs("{}/__paddle_checkpoint__.0".format(dir_path))
+        fs.mkdirs("{}/__paddle_checkpoint__.exe".format(dir_path))
+
+        a = s.get_checkpoint_no(dir_path)
+        self.assertEqual(len(a), 1)
+
+        s.clean_redundant_checkpoints(dir_path)
+        s.clean_redundant_checkpoints(dir_path)
+
+        fs.delete(dir_path)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
index 4e2280c0118a11ebfc21f6179b8a7a795c6f53da..ab08a0aacbf08768ffff43974ee9a7c7dd4a7288 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
@@ -90,5 +90,55 @@ class TestCholeskyOp2D(TestCholeskyOp):
         self._input_shape = (64, 64)
 
 
+class TestDygraph(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+        a = np.random.rand(3, 3)
+        a_t = np.transpose(a, [1, 0])
+        x_data = np.matmul(a, a_t) + 1e-03
+        x = paddle.to_variable(x_data)
+        out = paddle.cholesky(x, upper=False)
+
+
+class TestCholeskySingularAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place, with_out=False):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[4, 4], dtype="float64")
+            result = paddle.cholesky(input)
+
+            input_np = np.zeros([4, 4]).astype("float64")
+
+            exe = fluid.Executor(place)
+            try:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": input_np},
+                                  fetch_list=[result])
+            except fluid.core.EnforceNotMet as ex:
+                print("The mat is singular")
+                pass
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                                     [[10, 11, 12], [13, 14, 15],
+                                      [16, 17, 18]]]).astype("float64")
+                input = fluid.dygraph.to_variable(input_np)
+                try:
+                    result = paddle.cholesky(input)
+                except fluid.core.EnforceNotMet as ex:
+                    print("The mat is singular")
+                    pass
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_op.py b/python/paddle/fluid/tests/unittests/test_chunk_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..043b326fbd98769f96688ef2eeaf23c53978c94d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_chunk_op.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import numpy as np
+from paddle.fluid import Program, program_guard
+from paddle import fluid
+import paddle
+
+
+class TestChunkOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # The type of axis in chunk_op should be int or Variable.
+            def test_axis_type():
+                x1 = paddle.data(shape=[4], dtype='float16', name='x3')
+                paddle.chunk(x=x1, chunks=2, axis=3.2)
+
+            self.assertRaises(TypeError, test_axis_type)
+
+            # The type of axis in chunk op should be int or Variable.
+            def test_axis_variable_type():
+                x2 = paddle.data(shape=[4], dtype='float16', name='x9')
+                x3 = paddle.data(shape=[1], dtype='float16', name='x10')
+                paddle.chunk(input=x2, chunks=2, axis=x3)
+
+            self.assertRaises(TypeError, test_axis_variable_type)
+
+            # The type of num_or_sections in chunk_op should be int, tuple or list.
+            def test_chunks_type():
+                x4 = paddle.data(shape=[4], dtype='float16', name='x4')
+                paddle.chunk(input=x4, chunks=2.1, axis=3)
+
+            self.assertRaises(TypeError, test_chunks_type)
+
+            def test_axis_type_tensor():
+                x5 = paddle.data(shape=[4], dtype='float16', name='x6')
+                paddle.chunk(input=x5, chunks=2, axis=3.2)
+
+            self.assertRaises(TypeError, test_axis_type_tensor)
+
+
+class API_TestChunk(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data1 = paddle.data('data1', shape=[4, 6, 6], dtype='float64')
+            data2 = paddle.data('data2', shape=[1], dtype='int32')
+            x0, x1, x2 = paddle.chunk(data1, chunks=3, axis=data2)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            input1 = np.random.random([4, 6, 6]).astype('float64')
+            input2 = np.array([2]).astype('int32')
+            r0, r1, r2, = exe.run(feed={"data1": input1,
+                                        "data2": input2},
+                                  fetch_list=[x0, x1, x2])
+            ex_x0, ex_x1, ex_x2 = np.array_split(input1, 3, axis=2)
+            self.assertTrue(np.allclose(ex_x0, r0))
+            self.assertTrue(np.allclose(ex_x1, r1))
+            self.assertTrue(np.allclose(ex_x2, r2))
+
+
+class API_TestChunk1(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data1 = paddle.data('data1', shape=[4, 6, 6], dtype='float64')
+            x0, x1, x2 = paddle.chunk(data1, chunks=3, axis=2)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            input1 = np.random.random([4, 6, 6]).astype('float64')
+            r0, r1, r2, = exe.run(feed={"data1": input1},
+                                  fetch_list=[x0, x1, x2])
+            ex_x0, ex_x1, ex_x2 = np.array_split(input1, 3, axis=2)
+            self.assertTrue(np.allclose(ex_x0, r0))
+            self.assertTrue(np.allclose(ex_x1, r1))
+            self.assertTrue(np.allclose(ex_x2, r2))
+
+
+class API_TestDygraphChunk(unittest.TestCase):
+    def test_out1(self):
+        with fluid.dygraph.guard():
+            input_1 = np.random.random([4, 6, 6]).astype("int32")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            x0, x1, x2 = paddle.chunk(input, chunks=3, axis=1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.array_split(input_1, 3, axis=1)
+        self.assertTrue(np.allclose(ex_x0, x0_out))
+        self.assertTrue(np.allclose(ex_x1, x1_out))
+        self.assertTrue(np.allclose(ex_x2, x2_out))
+
+    def test_out2(self):
+        with fluid.dygraph.guard():
+            input_1 = np.random.random([4, 6, 6]).astype("bool")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            x0, x1, x2 = paddle.chunk(input, chunks=3, axis=1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.array_split(input_1, 3, axis=1)
+        self.assertTrue(np.allclose(ex_x0, x0_out))
+        self.assertTrue(np.allclose(ex_x1, x1_out))
+        self.assertTrue(np.allclose(ex_x2, x2_out))
+
+    def test_axis_tensor_input(self):
+        with fluid.dygraph.guard():
+            input_1 = np.random.random([4, 6, 6]).astype("int32")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            num1 = paddle.full(shape=[1], fill_value=1, dtype='int32')
+            x0, x1, x2 = paddle.chunk(input, chunks=3, axis=num1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.array_split(input_1, 3, axis=1)
+        self.assertTrue(np.allclose(ex_x0, x0_out))
+        self.assertTrue(np.allclose(ex_x1, x1_out))
+        self.assertTrue(np.allclose(ex_x2, x2_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_clamp.py b/python/paddle/fluid/tests/unittests/test_clamp.py
deleted file mode 100644
index d8d7fe01f8de8686724ea8ebc00491269f2cc0bd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_clamp.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import paddle.tensor as tensor
-import paddle.fluid as fluid
-import numpy as np
-import unittest
-
-
-class TestClampAPI(unittest.TestCase):
-    def test_dygraph_clamp(self):
-        in1 = np.array([[1.2, 3.5], [4.5, 6.4]]).astype('float32')
-        with fluid.dygraph.guard():
-            x1 = fluid.dygraph.to_variable(in1)
-            out1 = tensor.clamp(x1, min=3.5, max=5.0)
-            out2 = tensor.clamp(x1, min=2.5)
-            self.assertTrue(
-                np.allclose(
-                    out1.numpy(), in1.clip(
-                        min=3.5, max=5.0)))
-            self.assertTrue(np.allclose(out2.numpy(), in1.clip(min=2.5)))
-
-    def test_clamp(self):
-        data_shape = [1, 9, 9, 4]
-        data = np.random.random(data_shape).astype('float32')
-        images = fluid.data(name='image', shape=data_shape, dtype='float32')
-        min = fluid.data(name='min', shape=[1], dtype='float32')
-        max = fluid.data(name='max', shape=[1], dtype='float32')
-
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        out_1 = tensor.clamp(images, min=min, max=max)
-        out_2 = tensor.clamp(images, min=0.2, max=0.9)
-        out_3 = tensor.clamp(images, min=0.3)
-        out_4 = tensor.clamp(images, max=0.7)
-        out_5 = tensor.clamp(images, min=min)
-        out_6 = tensor.clamp(images, max=max)
-
-        res1, res2, res3, res4, res5, res6 = exe.run(
-            fluid.default_main_program(),
-            feed={
-                "image": data,
-                "min": np.array([0.2]).astype('float32'),
-                "max": np.array([0.8]).astype('float32')
-            },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6])
-
-        self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
-        self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
-        self.assertTrue(np.allclose(res3, data.clip(min=0.3)))
-        self.assertTrue(np.allclose(res4, data.clip(max=0.7)))
-        self.assertTrue(np.allclose(res5, data.clip(min=0.2)))
-        self.assertTrue(np.allclose(res6, data.clip(max=0.8)))
-
-
-class TestClampError(unittest.TestCase):
-    def test_errors(self):
-        x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16")
-        x2 = fluid.layers.data(name='x2', shape=[1], dtype="int8")
-        self.assertRaises(TypeError, tensor.clamp, x=x1, min=0.2, max=0.8)
-        self.assertRaises(TypeError, tensor.clamp, x=x2, min=0.2, max=0.8)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index 33bbd4c8830d689bed513b9ce4084c3d00a923a8..93b31f052aae14546effef1696d2719dfff6727b 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 from op_test import OpTest
@@ -109,5 +110,64 @@ class TestClipOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_dtype)
 
 
+class TestClipAPI(unittest.TestCase):
+    def test_clip(self):
+        data_shape = [1, 9, 9, 4]
+        data = np.random.random(data_shape).astype('float32')
+        images = fluid.data(name='image', shape=data_shape, dtype='float32')
+        min = fluid.data(name='min', shape=[1], dtype='float32')
+        max = fluid.data(name='max', shape=[1], dtype='float32')
+
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        out_1 = paddle.clip(images, min=min, max=max)
+        out_2 = paddle.clip(images, min=0.2, max=0.9)
+        out_3 = paddle.clip(images, min=0.3)
+        out_4 = paddle.clip(images, max=0.7)
+        out_5 = paddle.clip(images, min=min)
+        out_6 = paddle.clip(images, max=max)
+
+        res1, res2, res3, res4, res5, res6 = exe.run(
+            fluid.default_main_program(),
+            feed={
+                "image": data,
+                "min": np.array([0.2]).astype('float32'),
+                "max": np.array([0.8]).astype('float32')
+            },
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6])
+
+        self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
+        self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
+        self.assertTrue(np.allclose(res3, data.clip(min=0.3)))
+        self.assertTrue(np.allclose(res4, data.clip(max=0.7)))
+        self.assertTrue(np.allclose(res5, data.clip(min=0.2)))
+        self.assertTrue(np.allclose(res6, data.clip(max=0.8)))
+
+    def test_clip_dygraph(self):
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        paddle.disable_static(place)
+        data_shape = [1, 9, 9, 4]
+        data = np.random.random(data_shape).astype('float32')
+        images = paddle.to_variable(data, dtype='float32')
+
+        out_1 = paddle.clip(images, min=0.2, max=0.8)
+        out_2 = paddle.clip(images, min=0.2, max=0.9)
+
+        self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8)))
+        self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9)))
+
+    def test_errors(self):
+        paddle.enable_static()
+        x1 = fluid.data(name='x1', shape=[1], dtype="int16")
+        x2 = fluid.data(name='x2', shape=[1], dtype="int8")
+        x3 = fluid.data(name='x3', shape=[1], dtype="float32")
+        self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8)
+        self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8)
+        self.assertRaises(Exception, paddle.clip, x=x3)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..71777df4651ea26c7cf5dfc7231018288c2670e2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveAllgatherAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allgather_nccl(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "nccl")
+
+    def test_allgather_gloo(self):
+        self.check_with_place("collective_allgather_api.py", "allgather",
+                              "gloo", "3")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..24dd7cacff6adc56eb059a7bec016a1d3e322825
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveAllreduceAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_allreduce_nccl(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "nccl")
+
+    def test_allreduce_gloo(self):
+        self.check_with_place("collective_allreduce_api.py", "allreduce",
+                              "gloo", "2")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..b04bd0cbdefbd6618cfc2b2a865c07a52a67d16b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -0,0 +1,284 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import unittest
+import time
+import argparse
+import os
+import six
+import sys
+import subprocess
+import traceback
+import functools
+import pickle
+from contextlib import closing
+from six import string_types
+import paddle.fluid as fluid
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+
+
+class TestCollectiveAPIRunnerBase(object):
+    def get_model(self, train_prog, startup_prog, rank):
+        raise NotImplementedError(
+            "get model should be implemented by child class.")
+
+    def wait_server_ready(self, endpoints):
+        assert not isinstance(endpoints, string_types)
+        while True:
+            all_ok = True
+            not_ready_endpoints = []
+            for ep in endpoints:
+                ip_port = ep.split(":")
+                with closing(
+                        socket.socket(socket.AF_INET,
+                                      socket.SOCK_STREAM)) as sock:
+                    sock.settimeout(2)
+                    result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                    if result != 0:
+                        all_ok = False
+                        not_ready_endpoints.append(ep)
+            if not all_ok:
+                sys.stderr.write("server not ready, wait 3 sec to retry...\n")
+                sys.stderr.write("not ready endpoints:" + str(
+                    not_ready_endpoints) + "\n")
+                sys.stderr.flush()
+                time.sleep(3)
+            else:
+                break
+
+    def initCommunicator(self, program, rank, nranks, wait_port,
+                         current_endpoint, endpoints):
+        other_endpoints = endpoints[:]
+        other_endpoints.remove(current_endpoint)
+        if rank == 0 and wait_port:
+            self.wait_server_ready(other_endpoints)
+        block = program.global_block()
+        nccl_id_var = block.create_var(
+            name=nameGen.generate('nccl_id'),
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
+        block.append_op(
+            type='c_gen_nccl_id',
+            inputs={},
+            outputs={'Out': nccl_id_var},
+            attrs={
+                'rank': rank,
+                'endpoint': current_endpoint,
+                'other_endpoints': other_endpoints
+            })
+
+        block.append_op(
+            type='c_comm_init',
+            inputs={'X': nccl_id_var},
+            outputs={},
+            attrs={
+                'nranks': nranks,
+                'rank': rank,
+                'ring_id': self.global_ring_id
+            })
+
+    def run_trainer(self, args):
+        train_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        endpoints = args["endpoints"].split(",")
+        rank = args["trainerid"]
+        current_endpoint = args["currentendpoint"]
+        nranks = 2
+        result = self.get_model(train_prog, startup_prog, rank)
+        if args['backend'] == 'nccl':
+            self.initCommunicator(startup_prog, rank, nranks, True,
+                                  current_endpoint, endpoints)
+            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+            place = fluid.CUDAPlace(
+                device_id)  #if args.use_gpu else fluid.CPUPlace()
+        else:
+            strategy = fluid.core.GlooParallelStrategy()
+            strategy.rank = rank
+            strategy.rank_num = nranks
+            strategy.prefix = ""
+            strategy.iface = "lo"
+            strategy.init_seconds = 999999
+            strategy.run_seconds = 999999
+            strategy.path = "/tmp/tmp%d" % args['path_id']
+            gloo = fluid.core.GlooParallelContext(strategy)
+            gloo.init()
+            place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        np.random.seed(os.getpid())
+        indata = np.random.random((10, 1000))
+        fetch_list = []
+        for elem in result:
+            fetch_list.append(elem.name)
+        out = exe.run(train_prog,
+                      feed={'tindata': indata},
+                      fetch_list=fetch_list)
+        if six.PY2:
+            print(pickle.dumps(out))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out))
+
+
+def runtime_main(test_class, col_type):
+    args = {}
+    model = test_class()
+    args["deviceid"] = os.getenv("FLAGS_selected_gpus")
+    args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID"))
+    args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM"))
+    args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS')
+    args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT")
+    args["col_type"] = col_type
+    args["backend"] = os.getenv("BACKEND")
+    args["path_id"] = int(os.getenv("PATH_ID"))
+    model.run_trainer(args)
+
+
+import paddle.compat as cpt
+import socket
+from contextlib import closing
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._port_set = set()
+        self._trainers = 2
+        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+            self._find_free_port(), self._find_free_port())
+        self._python_interp = sys.executable
+
+    def _find_free_port(self):
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
+
+    def _run_cluster(self, model_file, envs):
+        worker_endpoints = self._ps_endpoints.split(",")
+        w0_ep, w1_ep = worker_endpoints
+        #print("w0_ep:",w0_ep," w1_ep:",w1_ep)
+        env0 = {
+            "FLAGS_selected_gpus": "0",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": w0_ep
+        }
+
+        env1 = {
+            "FLAGS_selected_gpus": "1",
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+            "PADDLE_CURRENT_ENDPOINT": w1_ep
+        }
+        #update environment
+        env0.update(envs)
+        env1.update(envs)
+        tr_cmd = "%s %s"
+        tr0_cmd = tr_cmd % (self._python_interp, model_file)
+        tr1_cmd = tr_cmd % (self._python_interp, model_file)
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
+        #print(tr0_cmd) 
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=env0)
+
+        tr1_proc = subprocess.Popen(
+            tr0_cmd.strip().split(),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=env1)
+
+        tr0_out, tr0_err = tr0_proc.communicate()
+        tr1_out, tr1_err = tr1_proc.communicate()
+        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+        return pickle.loads(tr0_out), pickle.loads(
+            tr1_out), tr0_proc.pid, tr1_proc.pid
+
+    def check_with_place(self,
+                         model_file,
+                         col_type,
+                         backend="nccl",
+                         path_id="0",
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_eager_delete_tensor_gb": "0.0",
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
+            "GLOG_v": "0",
+            "NCCL_P2P_DISABLE": "1",
+            "BACKEND": backend,
+            "PATH_ID": path_id
+        }
+        required_envs.update(need_envs)
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
+                                                         required_envs)
+        np.random.seed(pid0)
+        input1 = np.random.random((10, 1000))
+        np.random.seed(pid1)
+        input2 = np.random.random((10, 1000))
+        if col_type == "allgather":
+            need_result = np.vstack((input1, input2))
+            tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
+            tr_out1 = np.vstack((tr1_out[0], tr1_out[1]))
+            self.assertTrue(np.allclose(tr_out0, need_result))
+            self.assertTrue(np.allclose(tr_out1, need_result))
+        elif col_type == "broadcast":
+            need_result = input2
+            self.assertTrue(np.allclose(tr0_out, need_result))
+            self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "reduce":
+            need_result = input1 + input2
+            self.assertTrue(np.allclose(tr0_out, need_result))
+        elif col_type == "scatter":
+            need_result = input2
+            need_result1 = need_result[0:need_result.shape[0] // 2]
+            need_result2 = need_result[need_result.shape[0] // 2:]
+            self.assertTrue(np.allclose(tr0_out, need_result1))
+            self.assertTrue(np.allclose(tr1_out, need_result2))
+        elif col_type == "allreduce":
+            need_result = input1 + input2
+            self.assertTrue(
+                np.allclose(
+                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+            self.assertTrue(
+                np.allclose(
+                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+        else:
+            pass
diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebf86f6ae14f1ecbdb3711378c84a3c1ce4967fb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveBarrierAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_barrier_nccl(self):
+        self.check_with_place("collective_barrier_api.py", "barrier", "nccl")
+
+    def test_barrier_gloo(self):
+        self.check_with_place("collective_barrier_api.py", "barrier", "gloo",
+                              "5")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py
index 3f3a5642abc242e994844d0aac1b79cbf664e4d4..512b2967e02fd01e67f416c2fd9222ae8589d8d8 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_base.py
@@ -241,6 +241,15 @@ class TestDistBase(unittest.TestCase):
             need_result = input2
             self.assertTrue(np.allclose(tr0_out, need_result))
             self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "reduce":
+            need_result = input1 + input2
+            self.assertTrue(np.allclose(tr1_out, need_result))
+        elif col_type == "scatter":
+            need_result = input2
+            need_result1 = need_result[0:need_result.shape[0] // 2]
+            need_result2 = need_result[need_result.shape[0] // 2:]
+            self.assertTrue(np.allclose(tr0_out, need_result1))
+            self.assertTrue(np.allclose(tr1_out, need_result2))
         elif col_type == "allreduce":
             need_result = input1 + input2
             self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1cf4f1ac4c822ad578f5ee0e0268324de5e5e25
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveBroadcastAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_broadcast_nccl(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "nccl")
+
+    def test_broadcast_gloo(self):
+        self.check_with_place("collective_broadcast_api.py", "broadcast",
+                              "gloo", "0")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..36837d6a227febd02e6ef1e2aeb905de19ca8acc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
@@ -0,0 +1,34 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_base import TestDistBase
+
+
+class TestCReduceOp(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_reduce(self):
+        self.check_with_place("collective_reduce_op.py", "reduce")
+
+    def test_reduce_calc_stream(self):
+        self.check_with_place("collective_reduce_op_calc_stream.py", "reduce")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf3975f3fc1c6959ffbb28a51543ebfef00c52e5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
@@ -0,0 +1,34 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveReduceAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_reduce_nccl(self):
+        self.check_with_place("collective_reduce_api.py", "reduce", "nccl")
+
+    def test_reduce_gloo(self):
+        self.check_with_place("collective_reduce_api.py", "reduce", "gloo", "1")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe3ce73359559c0f9b4e0e3990032ce693aab8a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
@@ -0,0 +1,31 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_base import TestDistBase
+
+
+class TestCScatterOp(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_scatter(self):
+        self.check_with_place("collective_scatter_op.py", "scatter")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cae842b396111f004b7ce52ce3f40c20ebe57263
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from test_collective_api_base import TestDistBase
+
+
+class TestCollectiveScatterAPI(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_scatter_gloo(self):
+        self.check_with_place("collective_scatter_api.py", "scatter", "gloo",
+                              "4")
+
+    def test_scatter_nccl(self):
+        self.check_with_place("collective_scatter_api.py", "scatter", "nccl")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index 46cae41f3045486837e33722b6c75f91859b65ba..30207340a27db0c1d00ab982cbac716e4b639c7e 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -25,33 +25,45 @@ import numpy
 import paddle
 import paddle.fluid as fluid
 
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
 
 
 class TestCommunicatorGeoEnd2End(unittest.TestCase):
     def net(self):
         x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        x1 = fluid.layers.data(name='x1', shape=[1], dtype='int64', lod_level=1)
+
+        emb = fluid.layers.embedding(
+            input=x1,
+            size=[10000, 10],
+            param_attr=fluid.ParamAttr(
+                name="embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+
+        pool = fluid.layers.sequence_pool(input=emb, pool_type="sum")
+        z = fluid.layers.concat(input=[x, pool], axis=1)
+        y_predict = fluid.layers.fc(input=z, size=1, act=None)
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
         avg_cost = fluid.layers.mean(cost)
-        return avg_cost, x, y
+        return avg_cost, x, x1, y
 
     def fake_reader(self):
         def reader():
             for i in range(10000):
                 x = numpy.random.random((1, 13)).astype('float32')
+                z = numpy.random.randint(0, 9999, (1, 1)).astype('int64')
                 y = numpy.random.randint(0, 2, (1, 1)).astype('int64')
-                yield x, y
+                yield x, z, y
 
         return reader
 
     def run_pserver(self, role, strategy):
         fleet.init(role)
-        avg_cost, x, y = self.net()
+        avg_cost, x, z, y = self.net()
         optimizer = fluid.optimizer.SGD(0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
@@ -64,33 +76,41 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
         exe = fluid.Executor(place)
 
         fleet.init(role)
-        avg_cost, x, y = self.net()
+        avg_cost, x, z, y = self.net()
         optimizer = fluid.optimizer.SGD(0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
         fleet.init_worker()
-        exe.run(fleet.startup_program)
+        exe.run(fluid.default_startup_program())
 
         train_reader = paddle.batch(self.fake_reader(), batch_size=24)
-        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])
 
         for batch_id, data in enumerate(train_reader()):
-            exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[])
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 
         fleet.stop_worker()
 
     def run_ut(self):
         training_role = os.getenv("TRAINING_ROLE", "TRAINER")
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER
-            if training_role == "TRAINER" else role_maker.Role.SERVER,
-            worker_num=1,
-            server_endpoints=["127.0.0.1:18099"])
+        os.environ["PADDLE_PSERVER_NUMS"] = "1"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001"
+
+        role = role_maker.PaddleCloudRoleMaker()
 
-        strategy = StrategyFactory.create_geo_strategy(10)
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        strategy.a_sync_configs = {"k_steps": 100}
 
         if training_role == "TRAINER":
             self.run_trainer(role, strategy)
@@ -116,8 +136,7 @@ import paddle.fluid as fluid
 from paddle.fluid.communicator import Communicator
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet as fleet
 
 from test_communicator_geo import TestCommunicatorGeoEnd2End
 
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index be1f32fb0aee10fa08f6609eeda06579145aeb26..c0044d9d620796057cce0e3a51b2dec2878a0e17 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -16,16 +16,13 @@ from __future__ import print_function
 
 import unittest
 import time
-import threading
-import numpy
 
+import os
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.communicator import Communicator
 
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
 
 
 class TestCommunicator(unittest.TestCase):
@@ -39,19 +36,24 @@ class TestCommunicator(unittest.TestCase):
         return avg_cost
 
     def test_communicator_sync(self):
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=2,
-            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
 
-        fleet.init(role)
+        fleet.init(role_maker.PaddleCloudRoleMaker())
         avg_cost = self.net()
 
         optimizer = fluid.optimizer.SGD(0.01)
 
-        strategy = StrategyFactory.create_sync_strategy()
-        strategy._program_config.wait_port = False
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index a97f54d6cac1ea91f05cb3dc68729f5b68df7c9e..cfad50409802d4f3d35c9da3b22597c681da91b1 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -93,11 +93,12 @@ def create_paddle_case(op_type, callback):
 
         def test_broadcast_api_1(self):
             with program_guard(Program(), Program()):
-                x = paddle.nn.data(name='x', shape=[1, 2, 1, 3], dtype='int32')
-                y = paddle.nn.data(name='y', shape=[1, 2, 3], dtype='int32')
+                x = paddle.static.data(
+                    name='x', shape=[1, 2, 1, 3], dtype='int32')
+                y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32')
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
-                exe = paddle.Executor(self.place)
+                exe = paddle.static.Executor(self.place)
                 input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
                 input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
                 real_result = callback(input_x, input_y)
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 48b597ab282351739fcca894aa69685a13a9688f..b4dbba7eead397c46c37a8df013dabb00177f030 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -268,9 +268,9 @@ class TestConcatAPI(unittest.TestCase):
         out_3 = paddle.concat(x=[x_2, x_3], axis=positive_1_int64)
         out_4 = paddle.concat(x=[x_2, x_3], axis=negative_int64)
 
-        exe = paddle.Executor(place=paddle.CPUPlace())
+        exe = paddle.static.Executor(place=paddle.CPUPlace())
         [res_1, res_2, res_3, res_4] = exe.run(
-            paddle.default_main_program(),
+            paddle.static.default_main_program(),
             feed={"x_1": input_2,
                   "x_2": input_2,
                   "x_3": input_3},
@@ -284,14 +284,15 @@ class TestConcatAPI(unittest.TestCase):
         in1 = np.array([[1, 2, 3], [4, 5, 6]])
         in2 = np.array([[11, 12, 13], [14, 15, 16]])
         in3 = np.array([[21, 22], [23, 24]])
-        with paddle.imperative.guard():
-            x1 = paddle.imperative.to_variable(in1)
-            x2 = paddle.imperative.to_variable(in2)
-            x3 = paddle.imperative.to_variable(in3)
-            out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
-            out2 = paddle.concat(x=[x1, x2], axis=0)
-            np_out1 = np.concatenate([in1, in2, in3], axis=-1)
-            np_out2 = np.concatenate([in1, in2], axis=0)
+        paddle.disable_static()
+        x1 = paddle.to_variable(in1)
+        x2 = paddle.to_variable(in2)
+        x3 = paddle.to_variable(in3)
+        out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
+        out2 = paddle.concat(x=[x1, x2], axis=0)
+        np_out1 = np.concatenate([in1, in2, in3], axis=-1)
+        np_out2 = np.concatenate([in1, in2], axis=0)
+        paddle.enable_static()
         self.assertEqual((out1.numpy() == np_out1).all(), True)
         self.assertEqual((out2.numpy() == np_out2).all(), True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..da527b26bf0608da5a648d92b492ff27cf2802f0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import unittest
+
+
+class Conv1dTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 batch_size=4,
+                 spartial_shape=(16, ),
+                 num_channels=6,
+                 num_filters=8,
+                 filter_size=3,
+                 padding=0,
+                 padding_mode="zeros",
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 no_bias=False,
+                 dtype="float32",
+                 data_format="NCL"):
+        super(Conv1dTestCase, self).__init__(methodName)
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.num_filters = num_filters
+        self.spartial_shape = spartial_shape
+        self.filter_size = filter_size
+        self.data_format = data_format
+        self.channel_last = (self.data_format == "NHWC")
+
+        self.padding = padding
+        self.padding_mode = padding_mode
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.no_bias = no_bias
+        self.dtype = dtype
+
+    def setUp(self):
+        input_shape = (self.batch_size, self.num_channels
+                       ) + self.spartial_shape if not self.channel_last else (
+                           self.batch_size, ) + self.spartial_shape + (
+                               self.num_channels, )
+        self.input = np.random.randn(*input_shape).astype(self.dtype)
+
+        if isinstance(self.filter_size, int):
+            filter_size = [self.filter_size]
+        else:
+            filter_size = self.filter_size
+        self.weight_shape = weight_shape = (self.num_filters, self.num_channels
+                                            // self.groups) + tuple(filter_size)
+        self.weight = np.random.uniform(
+            -1, 1, size=weight_shape).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(
+                -1, 1, size=(self.num_filters, )).astype(self.dtype)
+        else:
+            self.bias = None
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                input_shape = (-1, self.num_channels,
+                               -1) if not self.channel_last else (
+                                   -1, -1, self.num_channels)
+                x_var = fluid.data("input", input_shape, dtype=self.dtype)
+                w_var = fluid.data(
+                    "weight", self.weight_shape, dtype=self.dtype)
+                b_var = fluid.data(
+                    "bias", (self.num_filters, ), dtype=self.dtype)
+                y_var = F.conv1d(
+                    x_var,
+                    w_var,
+                    b_var if not self.no_bias else None,
+                    padding=self.padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format)
+        feed_dict = {"input": self.input, "weight": self.weight}
+        if self.bias is not None:
+            feed_dict["bias"] = self.bias
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def paddle_nn_layer(self):
+        x_var = paddle.to_tensor(self.input)
+        conv = nn.Conv1d(
+            self.num_channels,
+            self.num_filters,
+            self.filter_size,
+            padding=self.padding,
+            padding_mode=self.padding_mode,
+            stride=self.stride,
+            dilation=self.dilation,
+            groups=self.groups,
+            data_format=self.data_format)
+        conv.weight.set_value(self.weight)
+        if not self.no_bias:
+            conv.bias.set_value(self.bias)
+        y_var = conv(x_var)
+        y_np = y_var.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.functional(place)
+        with dg.guard(place):
+            result2 = self.paddle_nn_layer()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class Conv1dErrorTestCase(Conv1dTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.paddle_nn_layer()
+
+
+def add_cases(suite):
+    suite.addTest(Conv1dTestCase(methodName='runTest'))
+    suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2))
+    suite.addTest(Conv1dTestCase(methodName='runTest', stride=2, dilation=(1)))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', padding="same", no_bias=True))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', filter_size=3, padding='valid'))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', padding=2, data_format='NLC'))
+    suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1]))
+    suite.addTest(Conv1dTestCase(methodName='runTest', padding=2))
+    suite.addTest(Conv1dTestCase(methodName='runTest'))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest', groups=2, padding="valid"))
+    suite.addTest(
+        Conv1dTestCase(
+            methodName='runTest',
+            num_filters=6,
+            num_channels=3,
+            groups=3,
+            padding="valid",
+            data_format='NLC'))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', padding_mode="reflect", padding="valid"))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', data_format="VALID"))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', padding_mode="VALID"))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', num_channels=5, groups=2))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', num_filters=8, num_channels=15, groups=3))
+    suite.addTest(
+        Conv1dErrorTestCase(
+            methodName='runTest', padding=[1, 2, 3, 4, 5]))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..73227dd3610376d85fcfc70bb2653dfd927427fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import unittest
+
+
+class ConvTranspose1dTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 batch_size=4,
+                 spartial_shape=16,
+                 in_channels=6,
+                 out_channels=8,
+                 filter_size=3,
+                 output_size=None,
+                 padding=0,
+                 output_padding=0,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 no_bias=False,
+                 data_format="NCL",
+                 dtype="float32"):
+        super(ConvTranspose1dTestCase, self).__init__(methodName)
+        self.batch_size = batch_size
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.spartial_shape = spartial_shape
+        self.filter_size = filter_size
+        self.output_size = output_size
+
+        self.padding = padding
+        self.output_padding = output_padding
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.no_bias = no_bias
+        self.data_format = data_format
+        self.dtype = dtype
+
+    def setUp(self):
+
+        self.channel_last = False if self.data_format == "NCL" else True
+        input_shape = (self.batch_size, self.in_channels,
+                       self.spartial_shape) if not self.channel_last else (
+                           self.batch_size,
+                           self.spartial_shape,
+                           self.in_channels, )
+        self.input = np.random.randn(*input_shape).astype(self.dtype)
+
+        if isinstance(self.filter_size, int):
+            filter_size = [self.filter_size]
+        else:
+            filter_size = self.filter_size
+        self.weight_shape = weight_shape = (self.in_channels, self.out_channels
+                                            // self.groups) + tuple(filter_size)
+        self.weight = np.random.uniform(
+            -1, 1, size=weight_shape).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(
+                -1, 1, size=(self.out_channels, )).astype(self.dtype)
+        else:
+            self.bias = None
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                input_shape = (-1, self.in_channels,
+                               -1) if not self.channel_last else (
+                                   -1, -1, self.in_channels)
+                x_var = fluid.data("input", input_shape, dtype=self.dtype)
+                w_var = fluid.data(
+                    "weight", self.weight_shape, dtype=self.dtype)
+                b_var = fluid.data(
+                    "bias", (self.out_channels, ), dtype=self.dtype)
+                y_var = F.conv_transpose1d(
+                    x_var,
+                    w_var,
+                    None if self.no_bias else b_var,
+                    output_size=self.output_size,
+                    padding=self.padding,
+                    output_padding=self.output_padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format)
+        feed_dict = {"input": self.input, "weight": self.weight}
+        if self.bias is not None:
+            feed_dict["bias"] = self.bias
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def paddle_nn_layer(self):
+        x_var = paddle.to_tensor(self.input)
+        conv = nn.ConvTranspose1d(
+            self.in_channels,
+            self.out_channels,
+            self.filter_size,
+            padding=self.padding,
+            output_padding=self.output_padding,
+            stride=self.stride,
+            dilation=self.dilation,
+            groups=self.groups,
+            data_format=self.data_format)
+        conv.weight.set_value(self.weight)
+        if not self.no_bias:
+            conv.bias.set_value(self.bias)
+        y_var = conv(x_var, output_size=self.output_size)
+        y_np = y_var.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.functional(place)
+        with dg.guard(place):
+            result2 = self.paddle_nn_layer()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class ConvTranspose1dErrorTestCase(ConvTranspose1dTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.paddle_nn_layer()
+
+
+def add_cases(suite):
+    suite.addTest(ConvTranspose1dTestCase(methodName='runTest'))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', stride=[2], no_bias=True, dilation=2))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest',
+            filter_size=(3),
+            output_size=[36],
+            stride=[2],
+            dilation=2))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', stride=2, dilation=(2)))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', padding="valid"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', padding='valid'))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', filter_size=1, padding=3))
+    suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[2]))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', data_format="NLC"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', groups=2, padding="valid"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest',
+            out_channels=6,
+            in_channels=3,
+            groups=3,
+            padding="valid"))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest',
+            data_format="NLC",
+            spartial_shape=16,
+            output_size=18))
+    suite.addTest(
+        ConvTranspose1dTestCase(
+            methodName='runTest', data_format="NLC", stride=3,
+            output_padding=2))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', data_format="not_valid"))
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', in_channels=5, groups=2))
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', stride=2, output_padding=3))
+    suite.addTest(
+        ConvTranspose1dErrorTestCase(
+            methodName='runTest', output_size="not_valid"))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
index 64653ce2e7b8630030094b4004ecb17d56d3ff43..6bfe2aca530ddea6b49f12ad34dd9672e2a99ab5 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
@@ -20,6 +20,10 @@ import paddle.fluid.initializer as I
 import unittest
 
 
+def _reverse_repeat_list(t, n):
+    return list(x for x in reversed(t) for _ in range(n))
+
+
 class Conv2DTestCase(unittest.TestCase):
     def __init__(self,
                  methodName='runTest',
@@ -29,12 +33,11 @@ class Conv2DTestCase(unittest.TestCase):
                  num_filters=8,
                  filter_size=3,
                  padding=0,
+                 padding_mode='zeros',
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCHW",
                  dtype="float32"):
         super(Conv2DTestCase, self).__init__(methodName)
@@ -45,12 +48,16 @@ class Conv2DTestCase(unittest.TestCase):
         self.filter_size = filter_size
 
         self.padding = padding
+        if padding_mode in {'reflect', 'replicate', 'circular'}:
+            _paired_padding = fluid.layers.utils.convert_to_list(padding, 2,
+                                                                 'padding')
+            self._reversed_padding_repeated_twice = _reverse_repeat_list(
+                _paired_padding, 2)
+        self.padding_mode = padding_mode
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -91,19 +98,27 @@ class Conv2DTestCase(unittest.TestCase):
                     bias_attr = False
                 else:
                     bias_attr = I.NumpyArrayInitializer(self.bias)
+                if self.padding_mode != 'zeros':
+                    x_var = F.pad(x_var,
+                                  self._reversed_padding_repeated_twice,
+                                  mode=self.padding_mode,
+                                  data_format=self.data_format)
+                    padding = 0
+                else:
+                    padding = self.padding
+
                 y_var = fluid.layers.conv2d(
                     x_var,
                     self.num_filters,
                     self.filter_size,
-                    padding=self.padding,
+                    padding=padding,
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
+
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -122,16 +137,24 @@ class Conv2DTestCase(unittest.TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 b_var = fluid.data(
                     "bias", (self.num_filters, ), dtype=self.dtype)
+
+                if self.padding_mode != 'zeros':
+                    x_var = F.pad(x_var,
+                                  self._reversed_padding_repeated_twice,
+                                  mode=self.padding_mode,
+                                  data_format=self.data_format)
+                    padding = 0
+                else:
+                    padding = self.padding
+
                 y_var = F.conv2d(
                     x_var,
                     w_var,
                     b_var if not self.no_bias else None,
-                    padding=self.padding,
+                    padding=padding,
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -143,18 +166,16 @@ class Conv2DTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv2D(
+        conv = nn.Conv2d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
             padding=self.padding,
+            padding_mode=self.padding_mode,
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -198,7 +219,7 @@ def add_cases(suite):
             methodName='runTest', stride=2, dilation=(2, 1)))
     suite.addTest(
         Conv2DTestCase(
-            methodName='runTest', padding="same", no_bias=True, act="sigmoid"))
+            methodName='runTest', padding="same", no_bias=True))
     suite.addTest(
         Conv2DTestCase(
             methodName='runTest', filter_size=(3, 3), padding='valid'))
@@ -222,15 +243,28 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
+    suite.addTest(
+        Conv2DTestCase(
+            methodName='runTest',
+            filter_size=(3, 3),
+            padding=1,
+            padding_mode='reflect'))
+    suite.addTest(
+        Conv2DTestCase(
+            methodName='runTest',
+            filter_size=(3, 3),
+            padding=1,
+            padding_mode='replicate'))
+    suite.addTest(
+        Conv2DTestCase(
+            methodName='runTest',
+            filter_size=(3, 3),
+            padding=1,
+            padding_mode='circular'))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv2DErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv2DErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
index 989836d5993af5620a7b5fbd86c07b028e419fc4..ba450b345b8a309f5d7ff1e7a5c149809f55f46c 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
@@ -29,13 +29,12 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                  num_filters=8,
                  filter_size=3,
                  output_size=None,
+                 output_padding=0,
                  padding=0,
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCHW",
                  dtype="float32"):
         super(Conv2DTransposeTestCase, self).__init__(methodName)
@@ -45,14 +44,13 @@ class Conv2DTransposeTestCase(unittest.TestCase):
         self.spartial_shape = spartial_shape
         self.filter_size = filter_size
         self.output_size = output_size
+        self.output_padding = output_padding
 
         self.padding = padding
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -93,6 +91,7 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                     bias_attr = False
                 else:
                     bias_attr = I.NumpyArrayInitializer(self.bias)
+
                 y_var = fluid.layers.conv2d_transpose(
                     x_var,
                     self.num_filters,
@@ -104,8 +103,6 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -125,17 +122,22 @@ class Conv2DTransposeTestCase(unittest.TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 b_var = fluid.data(
                     "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv2d_transpose(
+
+                if self.output_padding != 0:
+                    output_size = None
+                else:
+                    output_size = self.output_size
+
+                y_var = F.conv_transpose2d(
                     x_var,
                     w_var,
                     None if self.no_bias else b_var,
-                    output_size=self.output_size,
+                    output_size=output_size,
                     padding=self.padding,
+                    output_padding=self.output_padding,
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -147,32 +149,38 @@ class Conv2DTransposeTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv2DTranspose(
+
+        if self.output_padding != 0:
+            output_size = None
+        else:
+            output_size = self.output_size
+
+        conv = nn.ConvTranspose2d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
-            output_size=self.output_size,
             padding=self.padding,
+            output_padding=self.output_padding,
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
-        y_var = conv(x_var)
+        y_var = conv(x_var, output_size)
         y_np = y_var.numpy()
         return y_np
 
     def _test_equivalence(self, place):
         place = fluid.CPUPlace()
+
         result1 = self.fluid_layer(place)
         result2 = self.functional(place)
+
         with dg.guard(place):
             result3 = self.paddle_nn_layer()
+
         np.testing.assert_array_almost_equal(result1, result2)
         np.testing.assert_array_almost_equal(result2, result3)
 
@@ -194,7 +202,7 @@ class Conv2DTransposeErrorTestCase(Conv2DTransposeTestCase):
 
 
 def add_cases(suite):
-    suite.addTest(Conv2DTransposeTestCase(methodName='runTest', act="relu"))
+    suite.addTest(Conv2DTransposeTestCase(methodName='runTest'))
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest', stride=[1, 2], no_bias=True, dilation=2))
@@ -211,9 +219,6 @@ def add_cases(suite):
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest', padding="valid"))
-    suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', padding='valid'))
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest', filter_size=1, padding=(2, 3)))
@@ -240,15 +245,22 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
+    suite.addTest(
+        Conv2DTransposeTestCase(
+            methodName='runTest',
+            num_filters=6,
+            num_channels=3,
+            spartial_shape=(7, 7),
+            filter_size=[5, 5],
+            groups=1,
+            padding=2,
+            stride=2,
+            output_size=[14, 14],
+            output_padding=[1, 1], ))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv2DTransposeErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv2DTransposeErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index f4418150e8a69d795ff544073b6ba6dd7431e44b..913db51da500b6c324abfab61744dfc1947bf7a5 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -77,8 +77,13 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
         output_size = attrs['output_size']
         out_h = output_size[0] + pad_h_0 + pad_h_1
         out_w = output_size[1] + pad_w_0 + pad_w_1
-
-    out = np.zeros((in_n, out_c, out_h, out_w), dtype=input_.dtype)
+    out_pad_h = 0
+    out_pad_w = 0
+    if 'output_padding' in attrs:
+        out_pad_h = attrs['output_padding'][0]
+        out_pad_w = attrs['output_padding'][1]
+    out = np.zeros(
+        (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype)
 
     for n in range(in_n):
         for i in range(in_h):
@@ -99,7 +104,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
                         out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
                             dilations[1]] += tmp_out
 
-    out = out[:, :, pad_h_0:out_h - pad_h_1, pad_w_0:out_w - pad_w_1]
+    out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h, pad_w_0:out_w - pad_w_1
+              + out_pad_w]
     if attrs['data_format'] == 'NHWC':
         out = np.transpose(out, [0, 2, 3, 1])
     return out
@@ -114,6 +120,7 @@ class TestConv2dTransposeOp(OpTest):
         self.use_cudnn = False
         self.use_mkldnn = False
         self.output_size = None
+        self.output_padding = []
         self.data_format = "NCHW"
         self.pad = [0, 0]
         self.padding_algorithm = "EXPLICIT"
@@ -138,6 +145,9 @@ class TestConv2dTransposeOp(OpTest):
         if self.output_size is not None:
             self.attrs['output_size'] = self.output_size
 
+        if len(self.output_padding) > 0:
+            self.attrs['output_padding'] = self.output_padding
+
         output = conv2dtranspose_forward_naive(input_, filter_,
                                                self.attrs).astype(self.dtype)
 
@@ -290,6 +300,18 @@ class TestWithEvenUpsample(TestConv2dTransposeOp):
         self.filter_size = [f_c, 6, 5, 5]
 
 
+class TestWithEvenUpsampleOutputPadding(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [2, 2]
+        self.stride = [2, 2]
+        self.groups = 1
+        self.dilations = [1, 1]
+        self.output_padding = [1, 1]
+        self.input_size = [2, 3, 7, 7]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 5, 5]
+
+
 class Test_NHWC(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [0, 0]
@@ -375,6 +397,19 @@ class TestWithEvenUpsample_NHWC(TestConv2dTransposeOp):
         self.data_format = 'NHWC'
 
 
+class TestWithEvenUpsample_NHWC_output_padding(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [2, 2]
+        self.stride = [2, 2]
+        self.groups = 1
+        self.dilations = [1, 1]
+        self.output_padding = [1, 1]
+        self.input_size = [2, 7, 7, 3]  # NHWC
+        f_c = self.input_size[-1]
+        self.filter_size = [f_c, 6, 5, 5]
+        self.data_format = 'NHWC'
+
+
 # ------------ test_cudnn ------------
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
index cf582c6210b76c6546de6d09d9219dbf4005bb17..56355a1c95e0396d0dec53cae02c3a99bf874013 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
@@ -32,9 +32,7 @@ class Conv3DTestCase(unittest.TestCase):
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCDHW",
                  dtype="float32"):
         super(Conv3DTestCase, self).__init__(methodName)
@@ -48,9 +46,7 @@ class Conv3DTestCase(unittest.TestCase):
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -101,8 +97,6 @@ class Conv3DTestCase(unittest.TestCase):
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -130,8 +124,6 @@ class Conv3DTestCase(unittest.TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -143,7 +135,7 @@ class Conv3DTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv3D(
+        conv = nn.Conv3d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
@@ -151,10 +143,7 @@ class Conv3DTestCase(unittest.TestCase):
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -225,15 +214,10 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv3DErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv3DErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
index acaf33467dbfc1c580ab3a36f08d0c2a26d7c239..e30f0cd3ecd0b872efa53c85e0666e4a6fb00a88 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
@@ -33,9 +33,7 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                  stride=1,
                  dilation=1,
                  groups=1,
-                 act=None,
                  no_bias=False,
-                 use_cudnn=True,
                  data_format="NCDHW",
                  dtype="float32"):
         super(Conv3DTransposeTestCase, self).__init__(methodName)
@@ -50,9 +48,7 @@ class Conv3DTransposeTestCase(unittest.TestCase):
         self.stride = stride
         self.dilation = dilation
         self.groups = groups
-        self.act = act
         self.no_bias = no_bias
-        self.use_cudnn = use_cudnn
         self.data_format = data_format
         self.dtype = dtype
 
@@ -104,8 +100,6 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                     groups=self.groups,
                     param_attr=weight_attr,
                     bias_attr=bias_attr,
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -125,7 +119,7 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 b_var = fluid.data(
                     "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv3d_transpose(
+                y_var = F.conv_transpose3d(
                     x_var,
                     w_var,
                     None if self.no_bias else b_var,
@@ -134,8 +128,6 @@ class Conv3DTransposeTestCase(unittest.TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    use_cudnn=self.use_cudnn,
                     data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
@@ -147,23 +139,19 @@ class Conv3DTransposeTestCase(unittest.TestCase):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv3DTranspose(
+        conv = nn.ConvTranspose3d(
             self.num_channels,
             self.num_filters,
             self.filter_size,
-            output_size=self.output_size,
             padding=self.padding,
             stride=self.stride,
             dilation=self.dilation,
             groups=self.groups,
-            act=self.act,
-            use_cudnn=self.use_cudnn,
-            data_format=self.data_format,
-            dtype=self.dtype)
+            data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
-        y_var = conv(x_var)
+        y_var = conv(x_var, self.output_size)
         y_np = y_var.numpy()
         return y_np
 
@@ -194,7 +182,7 @@ class Conv3DTransposeErrorTestCase(Conv3DTransposeTestCase):
 
 
 def add_cases(suite):
-    suite.addTest(Conv3DTransposeTestCase(methodName='runTest', act="tanh"))
+    suite.addTest(Conv3DTransposeTestCase(methodName='runTest'))
     suite.addTest(
         Conv3DTransposeTestCase(
             methodName='runTest', stride=[1, 2, 1], dilation=2, no_bias=True))
@@ -240,15 +228,10 @@ def add_cases(suite):
             num_filters=6,
             num_channels=3,
             groups=3,
-            use_cudnn=False,
-            act="sigmoid",
             padding="valid"))
 
 
 def add_error_cases(suite):
-    suite.addTest(
-        Conv3DTransposeErrorTestCase(
-            methodName='runTest', use_cudnn="not_valid"))
     suite.addTest(
         Conv3DTransposeErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e25613fa63da440f71f23841095f153e61735e9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
@@ -0,0 +1,140 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.fluid.core as core
+
+from paddle.fluid import Program, program_guard, Executor, default_main_program
+
+
+class TestCosineSimilarityAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def _get_numpy_out(self, x1, x2, axis=1, eps=1e-8):
+        w12 = np.sum(x1 * x2, axis=axis)
+        w1 = np.sum(x1 * x1, axis=axis)
+        w2 = np.sum(x2 * x2, axis=axis)
+        n12 = np.sqrt(np.clip(w1 * w2, eps * eps, None))
+        cos_sim = w12 / n12
+        return cos_sim
+
+    def check_static_result(self, place):
+        paddle.enable_static()
+
+        with program_guard(Program(), Program()):
+            shape = [10, 15]
+            axis = 1
+            eps = 1e-8
+            np.random.seed(0)
+            np_x1 = np.random.rand(*shape).astype(np.float32)
+            np_x2 = np.random.rand(*shape).astype(np.float32)
+
+            x1 = paddle.data(name="x1", shape=shape)
+            x2 = paddle.data(name="x2", shape=shape)
+            result = F.cosine_similarity(x1, x2, axis=axis, eps=eps)
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x1": np_x1,
+                                    "x2": np_x2},
+                              fetch_list=[result])
+
+            np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+            self.assertTrue(np.allclose(fetches[0], np_out))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph_1(self):
+        paddle.disable_static()
+
+        shape = [10, 15]
+        axis = 1
+        eps = 1e-8
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape).astype(np.float32)
+        np_x2 = np.random.rand(*shape).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+    def test_dygraph_2(self):
+        paddle.disable_static()
+
+        shape = [12, 13]
+        axis = 0
+        eps = 1e-6
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape).astype(np.float32)
+        np_x2 = np.random.rand(*shape).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+    def test_dygraph_3(self):
+        paddle.disable_static()
+
+        shape1 = [10, 12, 10]
+        shape2 = [10, 1, 10]
+        axis = 2
+        eps = 1e-6
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape1).astype(np.float32)
+        np_x2 = np.random.rand(*shape2).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+    def test_dygraph_4(self):
+        paddle.disable_static()
+
+        shape1 = [23, 12, 1]
+        shape2 = [23, 1, 10]
+        axis = 2
+        eps = 1e-6
+        np.random.seed(1)
+        np_x1 = np.random.rand(*shape1).astype(np.float32)
+        np_x2 = np.random.rand(*shape2).astype(np.float32)
+        np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
+
+        cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps)
+        tesnor_x1 = paddle.to_variable(np_x1)
+        tesnor_x2 = paddle.to_variable(np_x2)
+        y = cos_sim_func(tesnor_x1, tesnor_x2)
+
+        self.assertTrue(np.allclose(y.numpy(), np_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
index 7f667d6b71c7f52f6d5afb42045c2da0cc45587b..4982cd195820811b9a8ec3fe6d01955234032120 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -535,5 +535,443 @@ class CrossEntropyLoss(unittest.TestCase):
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
 
+class FuncCrossEntropyLoss(unittest.TestCase):
+    #1
+    def test_cross_entropy_loss_1d_with_weight_mean(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        weight_np = np.random.random([200]).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            weight = fluid.data(name='weight', shape=[200], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np)[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #2
+    def test_cross_entropy_loss_1d_with_weight_sum(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        weight_np = np.random.random([200]).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            weight = fluid.data(name='weight', shape=[200], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #3
+    def test_cross_entropy_loss_1d_with_weight_none(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        weight_np = np.random.random([200]).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            weight = fluid.data(name='weight', shape=[200], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #4
+    def test_cross_entropy_loss_1d_mean(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={'input': input_np,
+                                       'label': label_np},
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(input_np, label_np)[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #5
+    def test_cross_entropy_loss_1d_sum(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={'input': input_np,
+                                       'label': label_np},
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(input_np, label_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #6
+    def test_cross_entropy_loss_1d_none(self):
+        input_np = np.random.random([100, 200]).astype(np.float64)
+        label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float64')
+            label = fluid.data(name='label', shape=[100], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={'input': input_np,
+                                       'label': label_np},
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(input_np, label_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #7
+    def test_cross_entropy_loss_2d_with_weight_none(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        weight_np = np.random.random(size=(3, )).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            weight = fluid.data(name='weight', shape=[3], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #8
+    def test_cross_entropy_loss_2d_with_weight_mean(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        weight_np = np.random.random(size=(3, )).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            weight = fluid.data(name='weight', shape=[3], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='mean')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='mean')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, reduction='mean')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #9
+    def test_cross_entropy_loss_2d_with_weight_sum(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        weight_np = np.random.random(size=(3, )).astype(np.float64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            weight = fluid.data(name='weight', shape=[3], dtype='float64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, weight=weight, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                weight=fluid.dygraph.to_variable(weight_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #10
+    def test_cross_entropy_loss_2d_none(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='none')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='none')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(input_np, label_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #11
+    def test_cross_entropy_loss_2d_mean(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='mean')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='mean')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, reduction='mean')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    #12
+    def test_cross_entropy_loss_2d_sum(self):
+        input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64)
+        label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
+            ret = paddle.nn.functional.cross_entropy(
+                input, label, reduction='sum')
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            dy_ret = paddle.nn.functional.cross_entropy(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np),
+                reduction='sum')
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(input_np, label_np, reduction='sum')[0]
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
index a1a80bfdb549fe509171d4ed3d320547aa5aec51..ad121fac8cc045e67cf116d2cf9cedd6ac9bef99 100644
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
@@ -17,9 +17,92 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+from paddle import to_variable
+
+
+class TestCumsumOp(unittest.TestCase):
+    def run_cases(self):
+        data_np = np.arange(12).reshape(3, 4)
+        data = to_variable(data_np)
+
+        y = paddle.cumsum(data)
+        z = np.cumsum(data_np)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+        y = paddle.cumsum(data, axis=0)
+        z = np.cumsum(data_np, axis=0)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+        y = paddle.cumsum(data, axis=-1)
+        z = np.cumsum(data_np, axis=-1)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+        y = paddle.cumsum(data, dtype='float64')
+        self.assertTrue(y.dtype == core.VarDesc.VarType.FP64)
+
+        y = paddle.cumsum(data, dtype=np.int32)
+        self.assertTrue(y.dtype == core.VarDesc.VarType.INT32)
+
+        y = paddle.cumsum(data, axis=-2)
+        z = np.cumsum(data_np, axis=-2)
+        self.assertTrue(np.array_equal(z, y.numpy()))
+
+    def run_static(self, use_gpu=False):
+        with fluid.program_guard(fluid.Program()):
+            data_np = np.random.random((100, 100)).astype(np.float32)
+            x = paddle.static.data('X', [100, 100])
+            y = paddle.cumsum(x)
+            y2 = paddle.cumsum(x, axis=0)
+            y3 = paddle.cumsum(x, axis=-1)
+            y4 = paddle.cumsum(x, dtype='float64')
+            y5 = paddle.cumsum(x, dtype=np.int32)
+            y6 = paddle.cumsum(x, axis=-2)
+
+            place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            out = exe.run(feed={'X': data_np},
+                          fetch_list=[
+                              y.name, y2.name, y3.name, y4.name, y5.name,
+                              y6.name
+                          ])
+
+            z = np.cumsum(data_np)
+            self.assertTrue(np.allclose(z, out[0]))
+            z = np.cumsum(data_np, axis=0)
+            self.assertTrue(np.allclose(z, out[1]))
+            z = np.cumsum(data_np, axis=-1)
+            self.assertTrue(np.allclose(z, out[2]))
+            self.assertTrue(out[3].dtype == np.float64)
+            self.assertTrue(out[4].dtype == np.int32)
+            z = np.cumsum(data_np, axis=-2)
+            self.assertTrue(np.allclose(z, out[5]))
+
+    def test_cpu(self):
+        paddle.disable_static(paddle.fluid.CPUPlace())
+        self.run_cases()
+        paddle.enable_static()
+
+        self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        paddle.disable_static(paddle.fluid.CUDAPlace(0))
+        self.run_cases()
+        paddle.enable_static()
+
+        self.run_static(use_gpu=True)
+
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = paddle.static.data('x', [3, 4])
+            y = paddle.cumsum(x, name='out')
+            self.assertTrue('out' in y.name)
 
 
 class TestSumOp1(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py
index 22dc72048e429ed257e9d7d1213b6cb7dcafbf1a..8070148f8b36dd7dab7711abaf25994acebc7e6f 100644
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
@@ -16,9 +16,11 @@ from __future__ import print_function
 
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import Program, program_guard
+import paddle.fluid.core as core
 
 
 class TestApiDataError(unittest.TestCase):
@@ -53,5 +55,49 @@ class TestApiDataError(unittest.TestCase):
             self.assertRaises(TypeError, test_shape_type)
 
 
+class TestApiStaticDataError(unittest.TestCase):
+    def test_fluid_dtype(self):
+        with program_guard(Program(), Program()):
+            x1 = paddle.static.data(name="x1", shape=[2, 25])
+            self.assertEqual(x1.dtype, core.VarDesc.VarType.FP32)
+
+            x2 = paddle.static.data(name="x2", shape=[2, 25], dtype="bool")
+            self.assertEqual(x2.dtype, core.VarDesc.VarType.BOOL)
+
+            paddle.set_default_dtype("float64")
+            x3 = paddle.static.data(name="x3", shape=[2, 25])
+            self.assertEqual(x3.dtype, core.VarDesc.VarType.FP64)
+
+    def test_fluid_data(self):
+        with program_guard(Program(), Program()):
+
+            # 1. The type of 'name' in fluid.data must be str.
+            def test_name_type():
+                paddle.static.data(name=1, shape=[2, 25], dtype="bool")
+
+            self.assertRaises(TypeError, test_name_type)
+
+            # 2. The type of 'shape' in fluid.data must be list or tuple.
+            def test_shape_type():
+                paddle.static.data(name='data1', shape=2, dtype="bool")
+
+            self.assertRaises(TypeError, test_shape_type)
+
+    def test_layers_data(self):
+        with program_guard(Program(), Program()):
+
+            # 1. The type of 'name' in layers.data must be str.
+            def test_name_type():
+                paddle.static.data(name=1, shape=[2, 25], dtype="bool")
+
+            self.assertRaises(TypeError, test_name_type)
+
+            # 2. The type of 'shape' in layers.data must be list or tuple.
+            def test_shape_type():
+                paddle.static.data(name='data1', shape=2, dtype="bool")
+
+            self.assertRaises(TypeError, test_shape_type)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
index c766cf17f422205521641ae44ab2060b4ab6e81c..cefef9ff9183e34d1ae7ae3e9a2f88969bf094a6 100644
--- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
@@ -271,7 +271,7 @@ class TestDataNormOpWithEnableScaleAndShift(OpTest):
         self.use_mkldnn = False
         epsilon = 0.00001
         slot_dim = -1
-        enable_scale_and_shitf = True
+        enable_scale_and_shift = True
         x_shape = [2, 50]
         scale_shape = [50]
         tp = np.float32
@@ -319,6 +319,63 @@ class TestDataNormOpWithEnableScaleAndShift(OpTest):
         self.check_grad(['X'], 'Y', no_grad_set=set([]))
 
 
+class TestDataNormOpWithoutEnableScaleAndShift(OpTest):
+    """
+    test class for data norm op
+    test forward and backward
+    """
+
+    def setUp(self):
+        """
+        init data norm op test env
+        """
+        self.op_type = 'data_norm'
+        self.use_mkldnn = False
+        epsilon = 0.00001
+        slot_dim = -1
+        enable_scale_and_shift = True
+        x_shape = [2, 50]
+        scale_shape = [50]
+        tp = np.float32
+
+        x_val = np.random.uniform(-1, 1, x_shape).astype(tp)
+        batch_size = np.ones(scale_shape).astype(tp)
+        batch_size *= 1e4
+        batch_sum = np.zeros(scale_shape).astype(tp)
+        batch_square_sum = np.ones(scale_shape).astype(tp)
+        batch_square_sum *= 1e4
+        scale_w = np.ones(scale_shape).astype(tp)
+        bias = np.zeros(scale_shape).astype(tp)
+
+        y = np.array(x_val)
+
+        mean = np.zeros(x_shape).astype(tp)
+        scale = np.ones(x_shape).astype(tp)
+
+        self.inputs = {
+            "X": x_val,
+            "BatchSize": batch_size,
+            "BatchSum": batch_sum,
+            "BatchSquareSum": batch_square_sum,
+            "scale_w": scale_w,
+            "bias": bias
+        }
+        self.outputs = {"Y": y, "Means": mean, "Scales": scale}
+        self.attrs = {"epsilon": epsilon, "use_mkldnn": self.use_mkldnn}
+
+    def test_check_output(self):
+        """
+        test check forward, check output
+        """
+        self.check_output()
+
+    def test_check_grad(self):
+        """
+        test check backward, check grad
+        """
+        self.check_grad(['X'], 'Y', no_grad_set=set([]))
+
+
 class TestDataNormOpWithEnableScaleAndShift_1(OpTest):
     """
     test class for data norm op
@@ -333,7 +390,7 @@ class TestDataNormOpWithEnableScaleAndShift_1(OpTest):
         self.use_mkldnn = False
         epsilon = 0.00001
         slot_dim = 1
-        enable_scale_and_shitf = True
+        enable_scale_and_shift = True
         x_shape = [2, 50]
         scale_shape = [50]
         tp = np.float32
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index cc2cee602918d53dd5435d9f498a9e8c9c948c58..582bb3dcc681921cdbf2111dcd26b299f06a3058 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -17,6 +17,7 @@ including create, config, run, etc.
 """
 
 from __future__ import print_function
+import paddle
 import paddle.fluid as fluid
 import paddle.compat as cpt
 import paddle.fluid.core as core
@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase):
     def test_dataset_create(self):
         """ Testcase for dataset create. """
         try:
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "InMemoryDataset")
         except:
             self.assertTrue(False)
 
         try:
-            dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "QueueDataset")
         except:
             self.assertTrue(False)
 
         try:
-            dataset = fluid.DatasetFactory().create_dataset(
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
                 "FileInstantDataset")
         except:
             self.assertTrue(False)
 
         try:
-            dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "MyOwnDataset")
             self.assertTrue(False)
         except:
             self.assertTrue(True)
@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(
@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase):
         dataset.set_trainer_num(4)
         dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
         dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi")
-        dataset.enable_pv_merge()
+        dataset.set_enable_pv_merge(False)
 
         thread_num = dataset.get_thread_num()
         self.assertEqual(thread_num, 12)
@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist([filename1, filename2])
@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist([
@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase):
                     name=slot, shape=[1], dtype="float32", lod_level=1)
                 slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(1)
         dataset.set_parse_ins_id(True)
@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase):
                 name="slot4", shape=[1], dtype="float32", lod_level=0)
             slots_vars = [var1, var2, var3, var4]
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(1)
         dataset.set_parse_ins_id(True)
@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="float32", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist([
@@ -507,7 +517,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(
@@ -532,7 +543,8 @@ class TestDataset(unittest.TestCase):
                 except Exception as e:
                     self.assertTrue(False)
 
-        dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset")
+        dataset2 = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset2.set_use_var(slots_vars)
         dataset2.set_batch_size(32)
         dataset2.set_thread(3)
@@ -573,7 +585,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="float32", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(
@@ -628,7 +641,8 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[None, 1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_input_type(1)
         dataset.set_batch_size(1)
         dataset.set_thread(2)
@@ -707,7 +721,8 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
             inputs(list): inputs of get_dataset
             files(list): files of  get_dataset
         """
-        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "QueueDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist(files)
@@ -864,7 +879,8 @@ class TestDataset2(unittest.TestCase):
             except ImportError as e:
                 print("warning: no mpi4py")
             exe.run(startup_program)
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "InMemoryDataset")
             dataset.set_batch_size(32)
             dataset.set_thread(3)
             dataset.set_filelist([
@@ -884,9 +900,6 @@ class TestDataset2(unittest.TestCase):
         """
         Testcase for InMemoryDataset from create to run.
         """
-
-        self.skipTest("parameter server will add pslib UT later")
-
         with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
             data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
             data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
@@ -902,7 +915,7 @@ class TestDataset2(unittest.TestCase):
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
         with fluid.program_guard(train_program, startup_program):
             slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
             slots_vars = []
@@ -936,7 +949,8 @@ class TestDataset2(unittest.TestCase):
             except ImportError as e:
                 print("warning: no mpi4py")
             exe.run(startup_program)
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "InMemoryDataset")
             dataset.set_batch_size(32)
             dataset.set_thread(3)
             dataset.set_filelist([
@@ -952,6 +966,63 @@ class TestDataset2(unittest.TestCase):
                 print("warning: catch expected error")
             fleet._opt_info = None
             fleet._fleet_ptr = None
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "InMemoryDataset")
+            dataset.set_rank_offset("")
+            dataset.set_pv_batch_size(1)
+            dataset.set_hdfs_config("", "")
+            d = paddle.distributed.fleet.DatasetBase()
+            try:
+                dataset.set_feed_type("MultiSlotInMemoryDataFeed")
+            except:
+                print("warning: catch expected error")
+            dataset.thread_num = 0
+            try:
+                dataset._prepare_to_run()
+            except:
+                print("warning: catch expected error")
+            dataset.set_parse_logkey(True)
+            dataset.set_merge_by_sid(True)
+            dataset.set_enable_pv_merge(True)
+            try:
+                dataset.preprocess_instance()
+            except:
+                print("warning: catch expected error")
+            try:
+                dataset.set_current_phase(1)
+            except:
+                print("warning: catch expected error")
+            try:
+                dataset.postprocess_instance()
+            except:
+                print("warning: catch expected error")
+            dataset.set_fleet_send_batch_size(1024)
+            try:
+                dataset.global_shuffle()
+            except:
+                print("warning: catch expected error")
+            dataset.get_pv_data_size()
+            dataset.get_memory_data_size()
+            dataset.get_shuffle_data_size()
+            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+                "QueueDataset")
+            try:
+                dataset.local_shuffle()
+            except:
+                print("warning: catch expected error")
+            try:
+                dataset.global_shuffle()
+            except:
+                print("warning: catch expected error")
+            dataset = paddle.distributed.fleet.FileInstantDataset()
+            try:
+                dataset.local_shuffle()
+            except:
+                print("warning: catch expected error")
+            try:
+                dataset.global_shuffle()
+            except:
+                print("warning: catch expected error")
 
         os.remove("./test_in_memory_dataset2_run2_a.txt")
         os.remove("./test_in_memory_dataset2_run2_b.txt")
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
index 10aefbb222bb029c48648ce27ead4666b15dfc4d..c13c33f209f0f7d0fff95bdfb5b4e551a145b87e 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid as fluid
 import numpy as np
 import six
@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
 
     def check_batch_number(self, place, randomize_batch_num=False):
         main_prog, startup_prog, feeds = self.build_network()
-        dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            self.dataset_name)
         dataset.set_batch_size(BATCH_SIZE)
 
         if isinstance(place, fluid.CPUPlace):
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
index f8cb6170be945ed628440b5a068f1acd0ac26503..a16f21c0f97c0902dd6c26561ed3f707b28ff947 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -122,14 +122,8 @@ class TestBase(unittest.TestCase):
                             label = item['label']
                             assert image.shape() == [BATCH_SIZE, 784]
                             assert label.shape() == [BATCH_SIZE, 1]
-                            if ps[i]._equals(fluid.CPUPlace()):
-                                assert image._place()._equals(fluid.CPUPlace())
-                                assert label._place()._equals(fluid.CPUPlace())
-                            else:
-                                assert image._place()._equals(
-                                    fluid.CUDAPinnedPlace())
-                                assert label._place()._equals(
-                                    fluid.CUDAPinnedPlace())
+                            assert image._place()._equals(ps[i])
+                            assert label._place()._equals(ps[i])
                         L, = exe.run(program=prog,
                                      feed=d,
                                      fetch_list=[loss],
diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..057933fc7a735c2732cd651e83e99ddfa747b8a8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py
@@ -0,0 +1,61 @@
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from paddle.framework import set_default_dtype, get_default_dtype
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear
+import paddle.fluid.core as core
+from paddle import to_variable
+
+
+class TestDefaultType(unittest.TestCase):
+    def check_default(self):
+        self.assertEqual("float32", get_default_dtype())
+
+    def test_api(self):
+        self.check_default()
+
+        set_default_dtype("float64")
+        self.assertEqual("float64", get_default_dtype())
+
+        set_default_dtype("float32")
+        self.assertEqual("float32", get_default_dtype())
+
+        set_default_dtype("float16")
+        self.assertEqual("float16", get_default_dtype())
+
+        set_default_dtype(np.float64)
+        self.assertEqual("float64", get_default_dtype())
+
+        set_default_dtype(np.float32)
+        self.assertEqual("float32", get_default_dtype())
+
+        set_default_dtype(np.float16)
+        self.assertEqual("float16", get_default_dtype())
+
+
+class TestRaiseError(unittest.TestCase):
+    def test_error(self):
+        self.assertRaises(TypeError, set_default_dtype, "int32")
+        self.assertRaises(TypeError, set_default_dtype, np.int32)
+        self.assertRaises(TypeError, set_default_dtype, "int64")
+        self.assertRaises(TypeError, set_default_dtype, np.int64)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index 6b49de536ad390e0accd42bdc4f6967fd9369d5a..8b1cce5333eb4858e32eef5072be1c145723db90 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -170,7 +170,8 @@ def program_equal(a, b):
                         k))
                     return False
             assert (len(a.blocks) == len(b.blocks))
-
+        elif k == '_auto_checkpoint_name':
+            continue
         elif (v != b.__dict__[k]):
             raise ValueError("In program_equal not equal:{0}\n".format(k))
 
diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ab56f9244f93266b90f3316bc2c2be5623e0ee7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_device.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from op_test import OpTest
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import warnings
+import paddle
+
+
+class TestStaticDeviceManage(unittest.TestCase):
+    def test_cpu_device(self):
+        paddle.set_device('cpu')
+        out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+        out2 = paddle.ones(shape=[1, 3], dtype='float32')
+        out3 = paddle.concat(x=[out1, out2], axis=0)
+        exe = paddle.fluid.Executor()
+        exe.run(paddle.fluid.default_startup_program())
+        res = exe.run(fetch_list=[out3])
+        device = paddle.get_device()
+        self.assertEqual(isinstance(exe.place, core.CPUPlace), True)
+        self.assertEqual(device, "cpu")
+
+    def test_gpu_device(self):
+        if core.is_compiled_with_cuda():
+            out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+            out2 = paddle.ones(shape=[1, 3], dtype='float32')
+            out3 = paddle.concat(x=[out1, out2], axis=0)
+            paddle.set_device('gpu:0')
+            exe = paddle.fluid.Executor()
+            exe.run(paddle.fluid.default_startup_program())
+            res = exe.run(fetch_list=[out3])
+            device = paddle.get_device()
+            self.assertEqual(isinstance(exe.place, core.CUDAPlace), True)
+            self.assertEqual(device, "gpu:0")
+
+
+class TestImperativeDeviceManage(unittest.TestCase):
+    def test_cpu(self):
+        with fluid.dygraph.guard():
+            paddle.set_device('cpu')
+            out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+            out2 = paddle.ones(shape=[1, 3], dtype='float32')
+            out3 = paddle.concat(x=[out1, out2], axis=0)
+            device = paddle.get_device()
+            self.assertEqual(
+                isinstance(framework._current_expected_place(), core.CPUPlace),
+                True)
+            self.assertEqual(device, "cpu")
+
+    def test_gpu(self):
+        if core.is_compiled_with_cuda():
+            with fluid.dygraph.guard():
+                paddle.set_device('gpu:0')
+                out1 = paddle.zeros(shape=[1, 3], dtype='float32')
+                out2 = paddle.ones(shape=[1, 3], dtype='float32')
+                out3 = paddle.concat(x=[out1, out2], axis=0)
+                device = paddle.get_device()
+                self.assertEqual(
+                    isinstance(framework._current_expected_place(),
+                               core.CUDAPlace), True)
+                self.assertEqual(device, "gpu:0")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
index b6566676d2533aad5272fe61dbedbc1d55ea213b..8bf40459902e09f19a5badce62084841a0a23619 100644
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
@@ -17,11 +17,173 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid import Program, program_guard
 
 
+class TestDiagV2Op(OpTest):
+    def setUp(self):
+        self.op_type = "diag_v2"
+        self.x = np.random.rand(10, 10)
+        self.offset = 0
+        self.padding_value = 0.0
+        self.out = np.diag(self.x, self.offset)
+
+        self.init_config()
+        self.inputs = {'X': self.x}
+        self.attrs = {
+            'offset': self.offset,
+            'padding_value': self.padding_value
+        }
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_config(self):
+        pass
+
+
+class TestDiagV2OpCase1(TestDiagV2Op):
+    def init_config(self):
+        self.offset = 1
+        self.out = np.diag(self.x, self.offset)
+
+
+class TestDiagV2OpCase2(TestDiagV2Op):
+    def init_config(self):
+        self.offset = -1
+        self.out = np.diag(self.x, self.offset)
+
+
+class TestDiagV2OpCase3(TestDiagV2Op):
+    def init_config(self):
+        self.x = np.random.randint(-10, 10, size=(10, 10))
+        self.out = np.diag(self.x, self.offset)
+
+
+class TestDiagV2OpCase4(TestDiagV2Op):
+    def init_config(self):
+        self.x = np.random.rand(100)
+        self.padding_value = 8
+        n = self.x.size
+        self.out = self.padding_value * np.ones((n, n)) + np.diag(
+            self.x, self.offset) - np.diag(self.padding_value * np.ones(n))
+
+
+class TestDiagV2Error(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_diag_v2_type():
+                x = [1, 2, 3]
+                output = paddle.diag(x)
+
+            self.assertRaises(TypeError, test_diag_v2_type)
+
+
+class TestDiagV2API(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10)).astype(np.float32)
+        self.expected0 = np.diag(self.input_np)
+        self.expected1 = np.diag(self.input_np, k=1)
+        self.expected2 = np.diag(self.input_np, k=-1)
+
+        self.input_np2 = np.random.rand(100)
+        self.offset = 0
+        self.padding_value = 8
+        n = self.input_np2.size
+        self.expected3 = self.padding_value * np.ones(
+            (n, n)) + np.diag(self.input_np2, self.offset) - np.diag(
+                self.padding_value * np.ones(n))
+
+        self.input_np3 = np.random.randint(-10, 10, size=(100)).astype(np.int64)
+        self.padding_value = 8.0
+        n = self.input_np3.size
+        self.expected4 = self.padding_value * np.ones(
+            (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
+                self.padding_value * np.ones(n))
+
+        self.padding_value = -8
+        self.expected5 = self.padding_value * np.ones(
+            (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
+                self.padding_value * np.ones(n))
+
+    def run_imperative(self):
+        x = paddle.to_tensor(self.input_np)
+        y = paddle.diag(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected0))
+
+        y = paddle.diag(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected1))
+
+        y = paddle.diag(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected2))
+
+        x = paddle.to_tensor(self.input_np2)
+        y = paddle.diag(x, padding_value=8)
+        self.assertTrue(np.allclose(y.numpy(), self.expected3))
+
+        x = paddle.to_tensor(self.input_np3)
+        y = paddle.diag(x, padding_value=8.0)
+        self.assertTrue(np.allclose(y.numpy(), self.expected4))
+
+        y = paddle.diag(x, padding_value=-8)
+        self.assertTrue(np.allclose(y.numpy(), self.expected5))
+
+    def run_static(self, use_gpu=False):
+        x = paddle.data(name='input', shape=[10, 10], dtype='float32')
+        x2 = paddle.data(name='input2', shape=[100], dtype='float64')
+        x3 = paddle.data(name='input3', shape=[100], dtype='int64')
+        result0 = paddle.diag(x)
+        result1 = paddle.diag(x, offset=1)
+        result2 = paddle.diag(x, offset=-1)
+        result3 = paddle.diag(x, name='aaa')
+        result4 = paddle.diag(x2, padding_value=8)
+        result5 = paddle.diag(x3, padding_value=8.0)
+        result6 = paddle.diag(x3, padding_value=-8)
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        res0, res1, res2, res4, res5, res6 = exe.run(
+            feed={
+                "input": self.input_np,
+                "input2": self.input_np2,
+                'input3': self.input_np3
+            },
+            fetch_list=[result0, result1, result2, result4, result5, result6])
+
+        self.assertTrue(np.allclose(res0, self.expected0))
+        self.assertTrue(np.allclose(res1, self.expected1))
+        self.assertTrue(np.allclose(res2, self.expected2))
+        self.assertTrue('aaa' in result3.name)
+        self.assertTrue(np.allclose(res4, self.expected3))
+        self.assertTrue(np.allclose(res5, self.expected4))
+        self.assertTrue(np.allclose(res6, self.expected5))
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+
 class TestDiagOp(OpTest):
     def setUp(self):
         self.op_type = "diag"
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc858828058079e7d54d3c753807725ce654a778
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -0,0 +1,181 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import time
+import subprocess
+import unittest
+import numpy as np
+import paddle
+
+
+class TestDirectory(unittest.TestCase):
+    def get_import_command(self, module):
+        paths = module.split('.')
+        if len(paths) == 1:
+            return 'import {}'.format(module)
+        package = '.'.join(paths[:-1])
+        func = paths[-1]
+        cmd = 'from {} import {}'.format(package, func)
+        return cmd
+
+    def test_new_directory(self):
+        new_directory = [
+            'paddle.enable_static', 'paddle.disable_static',
+            'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
+            'paddle.no_grad', 'paddle.save', 'paddle.load',
+            'paddle.static.save', 'paddle.static.load',
+            'paddle.BackwardStrategy', 'paddle.ParallelEnv',
+            'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit',
+            'paddle.jit.TracedLayer', 'paddle.jit.to_static',
+            'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
+            'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig',
+            'paddle.NoamDecay', 'paddle.PiecewiseDecay',
+            'paddle.NaturalExpDecay', 'paddle.ExponentialDecay',
+            'paddle.InverseTimeDecay', 'paddle.PolynomialDecay',
+            'paddle.CosineDecay', 'paddle.static.Executor',
+            'paddle.static.global_scope', 'paddle.static.scope_guard',
+            'paddle.static.append_backward', 'paddle.static.gradients',
+            'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram',
+            'paddle.static.ExecutionStrategy',
+            'paddle.static.default_main_program',
+            'paddle.static.default_startup_program', 'paddle.static.Program',
+            'paddle.static.name_scope', 'paddle.static.program_guard',
+            'paddle.static.Print', 'paddle.static.py_func',
+            'paddle.static.ParallelExecutor',
+            'paddle.static.WeightNormParamAttr', 'paddle.static.nn.fc',
+            'paddle.static.nn.batch_norm',
+            'paddle.static.nn.bilinear_tensor_product',
+            'paddle.static.nn.conv2d', 'paddle.static.nn.conv2d_transpose',
+            'paddle.static.nn.conv3d', 'paddle.static.nn.conv3d_transpose',
+            'paddle.static.nn.create_parameter',
+            'paddle.static.nn.crf_decoding', 'paddle.static.nn.data_norm',
+            'paddle.static.nn.deformable_conv', 'paddle.static.nn.group_norm',
+            'paddle.static.nn.hsigmoid', 'paddle.static.nn.instance_norm',
+            'paddle.static.nn.layer_norm', 'paddle.static.nn.multi_box_head',
+            'paddle.static.nn.nce', 'paddle.static.nn.prelu',
+            'paddle.static.nn.row_conv', 'paddle.static.nn.spectral_norm',
+            'paddle.static.nn.embedding'
+        ]
+
+        import_file = 'run_import_modules.py'
+
+        with open(import_file, "w") as wb:
+            for module in new_directory:
+                run_cmd = self.get_import_command(module)
+                wb.write("{}\n".format(run_cmd))
+
+        _python = sys.executable
+
+        ps_cmd = "{} {}".format(_python, import_file)
+        ps_proc = subprocess.Popen(
+            ps_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        stdout, stderr = ps_proc.communicate()
+
+        assert "Error" not in str(stderr), "Error: Can't" \
+            " import Module {}".format(module)
+
+    def test_old_directory(self):
+        old_directory = [
+            'paddle.enable_imperative', 'paddle.disable_imperative',
+            'paddle.in_imperative_mode', 'paddle.imperative.to_variable',
+            'paddle.imperative.enable', 'paddle.imperative.guard',
+            'paddle.imperative.grad', 'paddle.imperative.no_grad',
+            'paddle.imperative.save', 'paddle.imperative.load',
+            'paddle.imperative.BackwardStrategy',
+            'paddle.imperative.ParallelEnv',
+            'paddle.imperative.prepare_context',
+            'paddle.imperative.DataParalell', 'paddle.imperative.jit',
+            'paddle.imperative.TracedLayer', 'paddle.imperative.declarative',
+            'paddle.imperative.ProgramTranslator',
+            'paddle.imperative.TranslatedLayer', 'paddle.imperative.jit.save',
+            'paddle.imperative.jit.load',
+            'paddle.imperative.jit.SaveLoadConfig',
+            'paddle.imperative.NoamDecay'
+            'paddle.imperative.PiecewiseDecay',
+            'paddle.imperative.NaturalExpDecay',
+            'paddle.imperative.ExponentialDecay',
+            'paddle.imperative.InverseTimeDecay',
+            'paddle.imperative.PolynomialDecay',
+            'paddle.imperative.CosineDecay', 'paddle.Executor',
+            'paddle.global_scope', 'paddle.scope_guard',
+            'paddle.append_backward', 'paddle.gradients',
+            'paddle.BuildStrategy', 'paddle.CompiledProgram',
+            'paddle.ExecutionStrategy', 'paddle.name_scope',
+            'paddle.program_guard', 'paddle.Print', 'paddle.py_func',
+            'paddle.ParallelExecutor', 'paddle.default_main_program',
+            'paddle.default_startup_program', 'paddle.Program',
+            'paddle.WeightNormParamAttr', 'paddle.declarative.fc',
+            'paddle.declarative.batch_norm',
+            'paddle.declarative.bilinear_tensor_product',
+            'paddle.declarative.conv2d', 'paddle.declarative.conv2d_transpose',
+            'paddle.declarative.conv3d', 'paddle.declarative.conv3d_transpose',
+            'paddle.declarative.create_parameter',
+            'paddle.declarative.crf_decoding', 'paddle.declarative.data_norm',
+            'paddle.declarative.deformable_conv',
+            'paddle.declarative.group_norm', 'paddle.declarative.hsigmoid',
+            'paddle.declarative.instance_norm', 'paddle.declarative.layer_norm',
+            'paddle.declarative.multi_box_head', 'paddle.declarative.nce',
+            'paddle.declarative.prelu', 'paddle.declarative.row_conv',
+            'paddle.declarative.spectral_norm', 'paddle.declarative.embedding'
+        ]
+
+        import_file = 'run_old_import_modules.py'
+
+        with open(import_file, "w") as wb:
+            cmd_context_count = """
+count = 0
+err_module = ""
+"""
+            wb.write(cmd_context_count)
+            for module in old_directory:
+                run_cmd = self.get_import_command(module)
+                cmd_context_loop_template = """
+try:
+    {run_cmd}
+except:
+    count += 1
+else:
+    err_module = "{module}"
+"""
+                cmd_context_loop = cmd_context_loop_template.format(
+                    run_cmd=run_cmd, module=module)
+                wb.write(cmd_context_loop)
+            cmd_context_print_template = """
+if count != {len_old_directory}:
+    print("Error: Module " + err_module + " should not be imported")
+"""
+            cmd_context_print = cmd_context_print_template.format(
+                len_old_directory=str(len(old_directory)))
+            wb.write(cmd_context_print)
+
+        _python = sys.executable
+
+        ps_cmd = "{} {}".format(_python, import_file)
+        ps_proc = subprocess.Popen(
+            ps_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        stdout, stderr = ps_proc.communicate()
+
+        assert "Error" not in str(stdout), str(stdout)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df55a6b873e28a6e479fd05b31074802eb19bb7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import unittest
+
+import paddle
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer_trainer(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")
+
+        sends = 0
+        sgds = 0
+        for op in prog.global_block().ops:
+            if op.type == "send":
+                sends += 1
+            if op.type == "sgd":
+                sgds += 1
+        self.assertEqual(sends, 7)
+        self.assertEqual(sgds, 0)
+
+        fleet.init_worker()
+        time.sleep(8)
+        fleet.stop_worker()
+
+    def test_a_sync_optimizer_pserver(self):
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
+        fleet.init_server()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ca41a11e325cfb66a3a3eaadb4eca6f9764212
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer_trainer(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        strategy.a_sync_configs = {"k_steps": 100}
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[-1].type, "send")
+
+        sends = 0
+        sgds = 0
+
+        for op in prog.global_block().ops:
+            if op.type == "send":
+                sends += 1
+            if op.type == "sgd":
+                sgds += 1
+        self.assertEqual(sends, 1)
+        self.assertEqual(sgds, 6)
+
+    def test_a_sync_optimizer_pserver(self):
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        strategy.a_sync_configs = {"k_steps": 100}
+        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0993e022e1b9570773634ec829b088c5ff145ea
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "6007"
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_gradient_merge_optimizer(self):
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        prog = paddle.fluid.default_main_program()
+        self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")
+
+        sends = 0
+        sgds = 0
+        for op in prog.global_block().ops:
+            if op.type == "send":
+                sends += 1
+            if op.type == "sgd":
+                sgds += 1
+        self.assertEqual(sends, 6)
+        self.assertEqual(sgds, 0)
+
+        fleet.init_worker()
+        time.sleep(8)
+        fleet.stop_worker()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 16f0fc0a35e6140941da09c13bf67855670fc6a1..beb0069eb770f25d7834749ff9c188e5252e13c0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -21,6 +21,9 @@ import os
 import sys
 import subprocess
 
+import six
+import shutil
+import numpy as np
 import argparse
 from contextlib import closing
 import socket
@@ -28,9 +31,11 @@ import time
 import tempfile
 import unittest
 
+import paddle
 import paddle.fluid as fluid
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.base.util_factory import fleet_util
+from paddle.distributed.fleet import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
 __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main']
@@ -48,36 +53,46 @@ class FleetDistRunnerBase(object):
     """
 
     def build_role(self, args):
+
         if args.role.upper() == "PSERVER":
             role = role_maker.UserDefinedRoleMaker(
+                is_collective=False,
+                init_gloo=False,
+                path=args.gloo_path,
                 current_id=args.current_id,
                 role=role_maker.Role.SERVER,
-                worker_num=args.trainers,
+                worker_endpoints=args.trainer_endpoints.split(","),
                 server_endpoints=args.endpoints.split(","))
         else:
             role = role_maker.UserDefinedRoleMaker(
+                is_collective=False,
+                init_gloo=False,
+                path=args.gloo_path,
                 current_id=args.current_id,
                 role=role_maker.Role.WORKER,
-                worker_num=args.trainers,
+                worker_endpoints=args.trainer_endpoints.split(","),
                 server_endpoints=args.endpoints.split(","))
+        self.role = role
         return role
 
     def build_strategy(self, args):
-        self.strategy = None
+        self.strategy = paddle.distributed.fleet.DistributedStrategy()
+        self.strategy.a_sync = False
         if args.mode == "async":
-            self.strategy = StrategyFactory.create_async_strategy()
-        elif args.mode == "sync":
-            self.strategy = StrategyFactory.create_sync_strategy()
-        elif args.mode == "half_async":
-            self.strategy = StrategyFactory.create_half_async_strategy()
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.a_sync = True
         elif args.mode == "geo":
-            self.strategy = StrategyFactory.create_geo_strategy(
-                args.geo_sgd_need_push_nums)
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.a_sync = True
+            self.strategy.a_sync_configs = {
+                "k_steps": args.geo_sgd_need_push_nums
+            }
         self.dump_param = os.getenv("dump_param", "").split(",")
         self.dump_fields = os.getenv("dump_fields", "").split(",")
         self.dump_fields_path = os.getenv("dump_fields_path", "")
         debug = int(os.getenv("Debug", "0"))
-        if debug:
+        # TODO(update strategy to support dump params)
+        if False:  #debug:
             self.strategy.set_debug_opt({
                 "dump_param": self.dump_param,
                 "dump_fields": self.dump_fields,
@@ -110,30 +125,17 @@ class FleetDistRunnerBase(object):
                     staircase=True))
         else:
             optimizer = fluid.optimizer.SGD(LEARNING_RATE)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
     def run_pserver(self, args):
-        fleet.init(self.build_role(args))
-        strategy = self.build_strategy(args)
-        avg_cost = self.net(args)
-        self.build_optimizer(avg_cost, strategy)
-
         fleet.init_server()
         fleet.run_server()
 
     def run_dataset_trainer(self, args):
-        fleet.init(self.build_role(args))
-        strategy = self.build_strategy(args)
-        avg_cost = self.net(args)
-        self.build_optimizer(avg_cost, strategy)
         out = self.do_dataset_training(fleet)
 
     def run_pyreader_trainer(self, args):
-        fleet.init(self.build_role(args))
-        strategy = self.build_strategy(args)
-        avg_cost = self.net(args)
-        self.build_optimizer(avg_cost, strategy)
         out = self.do_pyreader_training(fleet)
 
     def net(self, args, batch_size=4, lr=0.01):
@@ -158,7 +160,13 @@ class TestFleetBase(unittest.TestCase):
     def _setup_config(self):
         raise NotImplementedError("tests should have _setup_config implemented")
 
+    def tearDown(self):
+        t = time.time() - self.startTime
+        print('%s: %.3f' % (self.__class__.__name__, t))
+
     def setUp(self):
+        self.startTime = time.time()
+
         self._mode = "sync"
         self._reader = "pyreader"
         self._trainers = 2
@@ -173,10 +181,14 @@ class TestFleetBase(unittest.TestCase):
             print("set begin_port:", DIST_UT_PORT)
             self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
                 DIST_UT_PORT, DIST_UT_PORT + 1)
-            DIST_UT_PORT += 2
+            self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                DIST_UT_PORT + 2, DIST_UT_PORT + 3)
+            DIST_UT_PORT += 4
         else:
             self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
                 self._find_free_port(), self._find_free_port())
+            self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+                self._find_free_port(), self._find_free_port())
 
         self._python_interp = sys.executable
         self._geo_sgd_need_push_nums = 5
@@ -236,18 +248,22 @@ class TestFleetBase(unittest.TestCase):
     def _run_cluster(self, model, envs):
         env = {'GRAD_CLIP': str(self._grad_clip_mode)}
         python_path = self._python_interp
+        gloo_path = tempfile.mkdtemp()
+
         if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
             envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
             python_path += " -m coverage run --branch -p"
         env.update(envs)
 
-        tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
-            python_path, model, self._ps_endpoints, self._trainers, self._mode,
-            self._geo_sgd_need_push_nums, self._reader)
+        tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
+            python_path, model, self._ps_endpoints, self._tr_endpoints,
+            self._trainers, self._mode, self._geo_sgd_need_push_nums,
+            self._reader, gloo_path)
 
-        ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
-            python_path, model, self._ps_endpoints, self._trainers, self._mode,
-            self._geo_sgd_need_push_nums, self._reader)
+        ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
+            python_path, model, self._ps_endpoints, self._tr_endpoints,
+            self._trainers, self._mode, self._geo_sgd_need_push_nums,
+            self._reader, gloo_path)
 
         # Run dist train to compare with local results
         ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
@@ -271,6 +287,23 @@ class TestFleetBase(unittest.TestCase):
 
         tr0_ret = tr0.returncode
         tr1_ret = tr0.returncode
+        if tr0_ret != 0:
+            print(
+                "========================Error tr0_err begin==========================="
+            )
+            os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
+            print(
+                "========================Error tr0_err end==========================="
+            )
+
+        if tr1_ret != 0:
+            print(
+                "========================Error tr1_err begin==========================="
+            )
+            os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
+            print(
+                "========================Error tr1_err end==========================="
+            )
 
         self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
         self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
@@ -284,6 +317,7 @@ class TestFleetBase(unittest.TestCase):
         ps0.terminate()
         ps1.terminate()
 
+        shutil.rmtree(gloo_path)
         return 0, 0
 
     def check_with_place(self,
@@ -313,6 +347,9 @@ def runtime_main(test_class):
     parser.add_argument(
         '--role', type=str, required=True, choices=['pserver', 'trainer'])
     parser.add_argument('--endpoints', type=str, required=False, default="")
+    parser.add_argument(
+        '--trainer_endpoints', type=str, required=False, default="")
+    parser.add_argument('--gloo_path', type=str, required=False, default="")
     parser.add_argument('--current_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument('--mode', type=str, required=False, default='geo')
@@ -322,6 +359,13 @@ def runtime_main(test_class):
     args = parser.parse_args()
 
     model = test_class()
+    role = model.build_role(args)
+    fleet.init(role)
+    strategy = model.build_strategy(args)
+    avg_cost = model.net(args)
+    model.build_optimizer(avg_cost, strategy)
+    fleet_util._set_strategy(strategy)
+    fleet_util._set_role_maker(role)
     if args.role == "pserver":
         model.run_pserver(args)
     else:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
index 5fc37335b21536cef160c9f72e68bf7eb0610e97..b506f179143412e2bdb5d9eda511d90a0a3eea6d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -123,7 +123,7 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase):
 
 class TestDistCtrHalfAsync2x2(TestFleetBase):
     def _setup_config(self):
-        self._mode = "half_async"
+        self._mode = "async"
         self._reader = "pyreader"
 
     def check_with_place(self,
@@ -156,5 +156,40 @@ class TestDistCtrHalfAsync2x2(TestFleetBase):
             "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
 
+class TestDistCtrPsGpuPyreaderAsync2x2(TestFleetBase):
+    def _setup_config(self):
+        self._mode = "async"
+        self._reader = "pyreader"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "30000",  # 5sec to fail fast
+            "http_proxy": "",
+            "FLAGS_communicator_send_queue_size": "2",
+            "FLAGS_communicator_max_merge_var_num": "2",
+            "CPU_NUM": "2",
+            "SAVE_MODEL": "1"
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+    def test_dist_train(self):
+        self.check_with_place(
+            "dist_fleet_ctr_ps_gpu.py", delta=1e-5, check_error_log=True)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bc0d8dadce44c8f711189466f34fb5cd76f39f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+import unittest
+import subprocess
+import time
+import paddle.fluid as fluid
+#import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+from test_dist_fleet_base import TestFleetBase
+
+#from dist_simnet_bow import train_network
+
+
+class TestDistGloo_2x2(TestFleetBase):
+    def _setup_config(self):
+        self._mode = "sync"
+        self._reader = "pyreader"
+        self._path = "./tmp4"
+        if (os.path.exists(self._path)):
+            shutil.rmtree(self._path)
+        # if not os.path.exists(self._path):
+        #      os.mkdir(self._path)
+
+    def _start_pserver(self, cmd, required_envs):
+        #env.update(required_envs)
+        ps0_cmd = cmd
+        ps1_cmd = cmd
+
+        ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+")
+        ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+")
+
+        required_envs["POD_IP"] = "127.0.0.1"
+        required_envs["PADDLE_PSERVER_ID"] = "0"
+        required_envs["PADDLE_PORT"] = "36011"
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps0_pipe,
+            env=required_envs)
+        print("PADDLE_PSERVER_ID=0:")
+        print(required_envs)
+        required_envs["PADDLE_PSERVER_ID"] = "1"
+        required_envs["PADDLE_PORT"] = "36012"
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=ps1_pipe,
+            env=required_envs)
+        print("PADDLE_PSERVER_ID=1:")
+        print(required_envs)
+        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
+
+    def _start_trainer(self, cmd, required_envs):
+        #env.update(required_envs)
+
+        tr0_cmd = cmd
+        tr1_cmd = cmd
+
+        tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
+        tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
+        required_envs["PADDLE_TRAINER_ID"] = "0"
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=required_envs)
+        print("PADDLE_TRAINER_ID=0:")
+        print(required_envs)
+        required_envs["PADDLE_TRAINER_ID"] = "1"
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=required_envs)
+        print("PADDLE_TRAINER_ID=1:")
+        print(required_envs)
+        return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
+
+    def _run_cluster(self, model, envs):
+        env = {'GRAD_CLIP': str(self._grad_clip_mode)}
+        python_path = self._python_interp
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
+            python_path += " -m coverage run --branch -p"
+        env.update(envs)
+
+        tr_cmd = "{0} {1}".format(python_path, model)
+
+        ps_cmd = "{0} {1}".format(python_path, model)
+
+        # Run dist train to compare with local results
+        env["TRAINING_ROLE"] = "PSERVER"
+        ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
+        print(ps_cmd)
+        env["TRAINING_ROLE"] = "TRAINER"
+        tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env)
+
+        # Wait until trainer process terminate
+        while True:
+            stat0 = tr0.poll()
+            time.sleep(0.1)
+            if stat0 is not None:
+                break
+
+        while True:
+            stat1 = tr1.poll()
+            time.sleep(0.1)
+            if stat1 is not None:
+                break
+
+        tr0_out, tr0_err = tr0.communicate()
+        tr1_out, tr1_err = tr1.communicate()
+
+        tr0_ret = tr0.returncode
+        tr1_ret = tr0.returncode
+
+        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
+        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
+
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+        ps0_pipe.close()
+        ps1_pipe.close()
+
+        ps0.terminate()
+        ps1.terminate()
+
+        return 0, 0
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": "",
+            "CPU_NUM": "2",
+            #PSERVER
+            "PADDLE_PSERVERS_IP_PORT_LIST": "127.0.0.1:36011,127.0.0.1:36012",
+            #"PADDLE_PSERVER_PORT_ARRAY":"(36011 36012)",
+            "PADDLE_PSERVER_NUMS": "2",
+            "PADDLE_TRAINER_ID": "0",
+            #TRAINER
+            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36013,127.0.0.1:36014",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_PSERVER_ID": "0",
+            #GLOO FLAG
+            "PADDLE_WITH_GLOO": "1",
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+    def test_dist_train(self):
+        print("path is not delete", os.path.exists("./tmp4"))
+        self.check_with_place(
+            "dist_fleet_debug_gloo.py", delta=1e-5, check_error_log=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index 833b7307fa317b171e3acbd3a508a1c8a8da3d94..e7b10be2349cce755267297025ca8520b6d494ee 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -19,10 +19,10 @@ import unittest
 import tempfile
 import shutil
 
+import paddle
 import paddle.fluid as fluid
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
 
 # For Net
 base_lr = 0.2
@@ -149,40 +149,41 @@ class TestPSPassWithBow(unittest.TestCase):
         return [avg_cost, acc, cos_q_pt]
 
     def test(self):
-        endpoints = ["127.0.0.1:36004"]
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
-
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
         loss, acc, _ = self.net()
-        optimizer = fluid.optimizer.SGD(base_lr)
-        strategy = StrategyFactory.create_async_strategy()
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(loss)
 
-        fleet.startup_program_bak = fleet.startup_program
-        fleet.startup_program = None
+        model_dir = tempfile.mkdtemp()
 
         with self.assertRaises(ValueError):
-            fleet.init_server()
-
-        model_dir = tempfile.mkdtemp()
+            fleet.init_server(os.path.join(model_dir, "temp"), "xxxx")
 
         with self.assertRaises(ValueError):
             fleet.init_server(os.path.join(model_dir, "temp"))
 
-        fleet.startup_program = fleet.startup_program_bak
         fleet.init_server()
 
         from paddle.fluid.communicator import LargeScaleKV
         kv = LargeScaleKV()
-        kv.save("__emb__", os.path.join(model_dir, "__emb__", "__emb__"))
-
-        fleet.main_program = fluid.Program()
+        kv.save("__emb__.block0",
+                os.path.join(model_dir, "__emb__", "__emb__.block0"))
+        fluid.framework.switch_main_program(fluid.Program())
         fleet.init_server(model_dir)
         shutil.rmtree(model_dir)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
index 761d57408b9a8f9e52419331bfb0bca5b0135c30..1062123948481a4164a12a4bed818b964923006f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
@@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase):
                          ["listen_and_serv"])
         # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale
         self.assertEqual([op.type for op in pserver.blocks[1].ops], [
-            "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow",
-            "scale"
+            "sum", "cast", "fill_constant", "elementwise_div", "floor",
+            "fill_constant", "elementwise_pow", "scale"
         ])
 
         # block1~2: optimize pass
diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ccaa3266e087a38ac38667c62487e69c6bb6bf6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_distribution.py
@@ -0,0 +1,725 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import paddle
+from paddle import fluid
+from paddle.fluid import layers
+from paddle.distribution import *
+import math
+
+
+class DistributionNumpy():
+    def sample(self):
+        raise NotImplementedError
+
+    def entropy(self):
+        raise NotImplementedError
+
+    def kl_divergence(self, other):
+        raise NotImplementedError
+
+    def log_prob(self, value):
+        raise NotImplementedError
+
+    def probs(self, value):
+        raise NotImplementedError
+
+
+class UniformNumpy(DistributionNumpy):
+    def __init__(self, low, high):
+        self.low = np.array(low).astype('float32')
+        self.high = np.array(high).astype('float32')
+
+    def sample(self, shape):
+        shape = tuple(shape) + (self.low + self.high).shape
+        return self.low + (np.random.uniform(size=shape) *
+                           (self.high - self.low))
+
+    def log_prob(self, value):
+        lb = np.less(self.low, value).astype('float32')
+        ub = np.less(value, self.high).astype('float32')
+        return np.log(lb * ub) - np.log(self.high - self.low)
+
+    def probs(self, value):
+        lb = np.less(self.low, value).astype('float32')
+        ub = np.less(value, self.high).astype('float32')
+        return (lb * ub) / (self.high - self.low)
+
+    def entropy(self):
+        return np.log(self.high - self.low)
+
+
+class NormalNumpy(DistributionNumpy):
+    def __init__(self, loc, scale):
+        self.loc = np.array(loc).astype('float32')
+        self.scale = np.array(scale).astype('float32')
+
+    def sample(self, shape):
+        shape = tuple(shape) + (self.loc + self.scale).shape
+        return self.loc + (np.random.randn(*shape) * self.scale)
+
+    def log_prob(self, value):
+        var = self.scale * self.scale
+        log_scale = np.log(self.scale)
+        return -((value - self.loc) * (value - self.loc)) / (
+            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
+
+    def probs(self, value):
+        var = self.scale * self.scale
+        return np.exp(-1. * ((value - self.loc) * (value - self.loc)) /
+                      (2. * var)) / (math.sqrt(2 * math.pi) * self.scale)
+
+    def entropy(self):
+        return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype(
+            'float32')) + np.log(self.scale)
+
+    def kl_divergence(self, other):
+        var_ratio = (self.scale / other.scale)
+        var_ratio = var_ratio * var_ratio
+        t1 = ((self.loc - other.loc) / other.scale)
+        t1 = (t1 * t1)
+        return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio))
+
+
+class DistributionTest(unittest.TestCase):
+    def setUp(self, use_gpu=False):
+        self.use_gpu = use_gpu
+        if not use_gpu:
+            place = fluid.CPUPlace()
+            self.gpu_id = -1
+        else:
+            place = fluid.CUDAPlace(0)
+            self.gpu_id = 0
+        self.executor = fluid.Executor(place)
+
+    def build_normal_common_net(self, batch_size, dims, loc_float, scale_float,
+                                other_loc_float, other_scale_float, scale_np,
+                                other_scale_np, loc_np, other_loc_np, loc,
+                                scale, other_loc, other_scale, values):
+        normal_int = Normal(int(loc_float), int(scale_float))
+        normal_float = Normal(loc_float, scale_float)
+        other_normal_float = Normal(other_loc_float, other_scale_float)
+
+        normal_float_np_broadcast = Normal(loc_float, scale_np)
+        other_normal_float_np_broadcast = Normal(other_loc_float,
+                                                 other_scale_np)
+
+        normal_np = Normal(loc_np, scale_np)
+        other_normal_np = Normal(other_loc_np, other_scale_np)
+
+        normal_variable = Normal(loc, scale)
+        other_normal_variable = Normal(other_loc, other_scale)
+
+        sample_int = normal_int.sample([batch_size, dims])
+        sample_float = normal_float.sample([batch_size, dims])
+        sample_float_np_broadcast = normal_float_np_broadcast.sample(
+            [batch_size, dims])
+        sample_np = normal_np.sample([batch_size, dims])
+        sample_variable = normal_variable.sample([batch_size, dims])
+
+        entropy_int = normal_int.entropy()
+        entropy_float = normal_float.entropy()
+        entropy_float_np_broadcast = normal_float_np_broadcast.entropy()
+        entropy_np = normal_np.entropy()
+        entropy_variable = normal_variable.entropy()
+
+        lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values)
+        lp_np = normal_np.log_prob(values)
+        lp_variable = normal_variable.log_prob(values)
+
+        p_float_np_broadcast = normal_float_np_broadcast.probs(values)
+        p_np = normal_np.probs(values)
+        p_variable = normal_variable.probs(values)
+
+        kl_float = normal_float.kl_divergence(other_normal_float)
+        kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence(
+            other_normal_float_np_broadcast)
+        kl_np = normal_np.kl_divergence(other_normal_np)
+        kl_variable = normal_variable.kl_divergence(other_normal_variable)
+
+        fetch_list = [
+            sample_int, sample_float, sample_float_np_broadcast, sample_np,
+            sample_variable, entropy_int, entropy_float,
+            entropy_float_np_broadcast, entropy_np, entropy_variable,
+            lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast,
+            p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np,
+            kl_variable
+        ]
+        return fetch_list
+
+    def build_normal_static(self, test_program, batch_size, dims, loc_float,
+                            scale_float, other_loc_float, other_scale_float,
+                            scale_np, other_scale_np, loc_np, other_loc_np,
+                            values_np):
+        with fluid.program_guard(test_program):
+            loc = layers.data(name='loc', shape=[dims], dtype='float32')
+            scale = layers.data(name='scale', shape=[dims], dtype='float32')
+
+            other_loc = layers.data(
+                name='other_loc', shape=[dims], dtype='float32')
+            other_scale = layers.data(
+                name='other_scale', shape=[dims], dtype='float32')
+
+            values = layers.data(name='values', shape=[dims], dtype='float32')
+
+            fetch_list = self.build_normal_common_net(
+                batch_size, dims, loc_float, scale_float, other_loc_float,
+                other_scale_float, scale_np, other_scale_np, loc_np,
+                other_loc_np, loc, scale, other_loc, other_scale, values)
+
+        feed_vars = {
+            'loc': loc_np,
+            'scale': scale_np,
+            'other_loc': other_loc_np,
+            'other_scale': other_scale_np,
+            'values': values_np
+        }
+        return feed_vars, fetch_list
+
+    def build_normal_dygraph(self, batch_size, dims, loc_float, scale_float,
+                             other_loc_float, other_scale_float, scale_np,
+                             other_scale_np, loc_np, other_loc_np, values_np):
+        loc = paddle.to_tensor(loc_np)
+        scale = paddle.to_tensor(scale_np)
+        other_loc = paddle.to_tensor(other_loc_np)
+        other_scale = paddle.to_tensor(other_scale_np)
+        values = paddle.to_tensor(values_np)
+
+        fetch_list = self.build_normal_common_net(
+            batch_size, dims, loc_float, scale_float, other_loc_float,
+            other_scale_float, scale_np, other_scale_np, loc_np, other_loc_np,
+            loc, scale, other_loc, other_scale, values)
+        fetch_list_numpy = [t.numpy() for t in fetch_list]
+        return fetch_list_numpy
+
+    def get_normal_random_input(self, batch_size, dims):
+        loc_np = np.random.randn(batch_size, dims).astype('float32')
+        other_loc_np = np.random.randn(batch_size, dims).astype('float32')
+
+        loc_float = (np.random.ranf() - 0.5) * 4
+        scale_float = (np.random.ranf() - 0.5) * 4
+        while scale_float < 0:
+            scale_float = (np.random.ranf() - 0.5) * 4
+
+        other_loc_float = (np.random.ranf() - 0.5) * 4
+        other_scale_float = (np.random.ranf() - 0.5) * 4
+        while other_scale_float < 0:
+            other_scale_float = (np.random.ranf() - 0.5) * 4
+
+        scale_np = np.random.randn(batch_size, dims).astype('float32')
+        other_scale_np = np.random.randn(batch_size, dims).astype('float32')
+        values_np = np.random.randn(batch_size, dims).astype('float32')
+
+        while not np.all(scale_np > 0):
+            scale_np = np.random.randn(batch_size, dims).astype('float32')
+        while not np.all(other_scale_np > 0):
+            other_scale_np = np.random.randn(batch_size, dims).astype('float32')
+        return [
+            loc_np, other_loc_np, loc_float, scale_float, other_loc_float,
+            other_scale_float, scale_np, other_scale_np, values_np
+        ]
+
+    def compare_normal_with_numpy(self,
+                                  data_list,
+                                  output_list,
+                                  batch_size=2,
+                                  dims=3,
+                                  tolerance=1e-6):
+        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
+
+        np_normal_int = NormalNumpy(int(loc_float), int(scale_float))
+        np_normal_float = NormalNumpy(loc_float, scale_float)
+        np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float)
+        np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np)
+        np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float,
+                                                         other_scale_np)
+        np_normal = NormalNumpy(loc_np, scale_np)
+        np_other_normal = NormalNumpy(other_loc_np, other_scale_np)
+
+        gt_sample_int = np_normal_int.sample([batch_size, dims])
+        gt_sample_float = np_normal_float.sample([batch_size, dims])
+        gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample(
+            [batch_size, dims])
+        gt_sample_np = np_normal.sample([batch_size, dims])
+        gt_entropy_int = np_normal_int.entropy()
+        gt_entropy_float = np_normal_float.entropy()
+        gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy()
+        gt_entropy = np_normal.entropy()
+        gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob(
+            values_np)
+        gt_lp = np_normal.log_prob(values_np)
+        gt_p_float_np_broadcast = np_normal_float_np_broadcast.probs(values_np)
+        gt_p = np_normal.probs(values_np)
+        gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float)
+        gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence(
+            np_other_normal_float_np_broadcast)
+        gt_kl = np_normal.kl_divergence(np_other_normal)
+
+        [
+            output_sample_int, output_sample_float,
+            output_sample_float_np_broadcast, output_sample_np,
+            output_sample_variable, output_entropy_int, output_entropy_float,
+            output_entropy_float_np_broadcast, output_entropy_np,
+            output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
+            output_lp_variable, output_p_float_np_broadcast, output_p_np,
+            output_p_variable, output_kl_float, output_kl_float_np_broadcast,
+            output_kl_np, output_kl_variable
+        ] = output_list
+
+        np.testing.assert_allclose(
+            output_sample_int.shape,
+            gt_sample_int.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_float.shape,
+            gt_sample_float.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_float_np_broadcast.shape,
+            gt_sample_float_np_broadcast.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_np.shape,
+            gt_sample_np.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_variable.shape,
+            gt_sample_np.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float,
+            gt_entropy_float,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float_np_broadcast,
+            gt_entropy_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_float_np_broadcast,
+            gt_lp_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_float_np_broadcast,
+            gt_p_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_np, gt_p, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_variable, gt_p, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_float_np_broadcast,
+            gt_kl_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_np, gt_kl, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance)
+
+    def test_normal_distribution_static(self,
+                                        batch_size=2,
+                                        dims=3,
+                                        tolerance=1e-6):
+        test_program = fluid.Program()
+        data_list = self.get_normal_random_input(batch_size, dims)
+        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
+
+        feed_vars, fetch_list = self.build_normal_static(
+            test_program, batch_size, dims, loc_float, scale_float,
+            other_loc_float, other_scale_float, scale_np, other_scale_np,
+            loc_np, other_loc_np, values_np)
+        self.executor.run(fluid.default_startup_program())
+
+        output_list = self.executor.run(program=test_program,
+                                        feed=feed_vars,
+                                        fetch_list=fetch_list)
+
+        self.compare_normal_with_numpy(data_list, output_list, batch_size, dims,
+                                       tolerance)
+
+    def test_normal_distribution_dygraph(self,
+                                         batch_size=2,
+                                         dims=3,
+                                         tolerance=1e-6):
+        paddle.disable_static()
+        data_list = self.get_normal_random_input(batch_size, dims)
+        loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list
+
+        output_list = self.build_normal_dygraph(
+            batch_size, dims, loc_float, scale_float, other_loc_float,
+            other_scale_float, scale_np, other_scale_np, loc_np, other_loc_np,
+            values_np)
+
+        self.compare_normal_with_numpy(data_list, output_list, batch_size, dims,
+                                       tolerance)
+        paddle.enable_static()
+
+    def build_uniform_common_net(self, batch_size, dims, low_float, high_float,
+                                 high_np, low_np, values_np, low, high, values):
+        uniform_int = Uniform(int(low_float), int(high_float))
+        uniform_float = Uniform(low_float, high_float)
+        uniform_float_np_broadcast = Uniform(low_float, high_np)
+        uniform_np = Uniform(low_np, high_np)
+        uniform_variable = Uniform(low, high)
+
+        sample_int = uniform_int.sample([batch_size, dims])
+        sample_float = uniform_float.sample([batch_size, dims])
+        sample_float_np_broadcast = uniform_float_np_broadcast.sample(
+            [batch_size, dims])
+        sample_np = uniform_np.sample([batch_size, dims])
+        sample_variable = uniform_variable.sample([batch_size, dims])
+
+        entropy_int = uniform_int.entropy()
+        entropy_float = uniform_float.entropy()
+        entropy_float_np_broadcast = uniform_float_np_broadcast.entropy()
+        entropy_np = uniform_np.entropy()
+        entropy_variable = uniform_variable.entropy()
+
+        lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values)
+        lp_np = uniform_np.log_prob(values)
+        lp_variable = uniform_variable.log_prob(values)
+
+        p_float_np_broadcast = uniform_float_np_broadcast.probs(values)
+        p_np = uniform_np.probs(values)
+        p_variable = uniform_variable.probs(values)
+
+        fetch_list = [
+            sample_int, sample_float, sample_float_np_broadcast, sample_np,
+            sample_variable, entropy_int, entropy_float,
+            entropy_float_np_broadcast, entropy_np, entropy_variable,
+            lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast,
+            p_np, p_variable
+        ]
+        return fetch_list
+
+    def build_uniform_static(self, test_program, batch_size, dims, low_float,
+                             high_float, high_np, low_np, values_np):
+        with fluid.program_guard(test_program):
+            low = layers.data(name='low', shape=[dims], dtype='float32')
+            high = layers.data(name='high', shape=[dims], dtype='float32')
+
+            values = layers.data(name='values', shape=[dims], dtype='float32')
+
+            fetch_list = self.build_uniform_common_net(
+                batch_size, dims, low_float, high_float, high_np, low_np,
+                values_np, low, high, values)
+
+        feed_vars = {'low': low_np, 'high': high_np, 'values': values_np}
+        return feed_vars, fetch_list
+
+    def build_uniform_dygraph(self, batch_size, dims, low_float, high_float,
+                              high_np, low_np, values_np):
+        low = paddle.to_tensor(low_np)
+        high = paddle.to_tensor(high_np)
+        values = paddle.to_tensor(values_np)
+
+        fetch_list = self.build_uniform_common_net(batch_size, dims, low_float,
+                                                   high_float, high_np, low_np,
+                                                   values_np, low, high, values)
+        fetch_list_numpy = [t.numpy() for t in fetch_list]
+        return fetch_list_numpy
+
+    def compare_uniform_with_numpy(self,
+                                   data_list,
+                                   output_list,
+                                   batch_size=2,
+                                   dims=3,
+                                   tolerance=1e-6):
+        [low_np, low_float, high_float, high_np, values_np] = data_list
+
+        np_uniform_int = UniformNumpy(int(low_float), int(high_float))
+        np_uniform_float = UniformNumpy(low_float, high_float)
+        np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np)
+        np_uniform = UniformNumpy(low_np, high_np)
+
+        gt_sample_int = np_uniform_int.sample([batch_size, dims])
+        gt_sample_float = np_uniform_float.sample([batch_size, dims])
+        gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample(
+            [batch_size, dims])
+        gt_sample_np = np_uniform.sample([batch_size, dims])
+        gt_entropy_int = np_uniform_int.entropy()
+        gt_entropy_float = np_uniform_float.entropy()
+        gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy()
+        gt_entropy = np_uniform.entropy()
+        gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob(
+            values_np)
+        gt_lp = np_uniform.log_prob(values_np)
+        gt_p_float_np_broadcast = np_uniform_float_np_broadcast.probs(values_np)
+        gt_p = np_uniform.probs(values_np)
+
+        [
+            output_sample_int, output_sample_float,
+            output_sample_float_np_broadcast, output_sample_np,
+            output_sample_variable, output_entropy_int, output_entropy_float,
+            output_entropy_float_np_broadcast, output_entropy_np,
+            output_entropy_variable, output_lp_float_np_broadcast, output_lp_np,
+            output_lp_variable, output_p_float_np_broadcast, output_p_np,
+            output_p_variable
+        ] = output_list
+
+        np.testing.assert_allclose(
+            output_sample_int.shape,
+            gt_sample_int.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_float.shape,
+            gt_sample_float.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_float_np_broadcast.shape,
+            gt_sample_float_np_broadcast.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_np.shape,
+            gt_sample_np.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_sample_variable.shape,
+            gt_sample_np.shape,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float,
+            gt_entropy_float,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_float_np_broadcast,
+            gt_entropy_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_float_np_broadcast,
+            gt_lp_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_float_np_broadcast,
+            gt_p_float_np_broadcast,
+            rtol=tolerance,
+            atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_np, gt_p, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            output_p_variable, gt_p, rtol=tolerance, atol=tolerance)
+
+    def test_uniform_distribution_static(self,
+                                         batch_size=2,
+                                         dims=3,
+                                         tolerance=1e-6):
+        test_program = fluid.Program()
+
+        low_np = np.random.randn(batch_size, dims).astype('float32')
+        low_float = np.random.uniform(-2, 1)
+        high_float = np.random.uniform(1, 3)
+        high_np = np.random.uniform(-5.0, 5.0,
+                                    (batch_size, dims)).astype('float32')
+        values_np = np.random.randn(batch_size, dims).astype('float32')
+
+        data_list = [low_np, low_float, high_float, high_np, values_np]
+
+        feed_vars, fetch_list = self.build_uniform_static(
+            test_program, batch_size, dims, low_float, high_float, high_np,
+            low_np, values_np)
+
+        self.executor.run(fluid.default_startup_program())
+
+        # result calculated by paddle
+        output_list = self.executor.run(program=test_program,
+                                        feed=feed_vars,
+                                        fetch_list=fetch_list)
+        self.compare_uniform_with_numpy(data_list, output_list, batch_size,
+                                        dims, tolerance)
+
+    def test_uniform_distribution_dygraph(self,
+                                          batch_size=2,
+                                          dims=3,
+                                          tolerance=1e-6):
+        paddle.disable_static()
+
+        low_np = np.random.randn(batch_size, dims).astype('float32')
+        low_float = np.random.uniform(-2, 1)
+        high_float = np.random.uniform(1, 3)
+        high_np = np.random.uniform(-5.0, 5.0,
+                                    (batch_size, dims)).astype('float32')
+        values_np = np.random.randn(batch_size, dims).astype('float32')
+
+        data_list = [low_np, low_float, high_float, high_np, values_np]
+        output_list = self.build_uniform_dygraph(
+            batch_size, dims, low_float, high_float, high_np, low_np, values_np)
+
+        self.compare_uniform_with_numpy(data_list, output_list, batch_size,
+                                        dims, tolerance)
+        paddle.enable_static()
+
+
+class DistributionTestError(unittest.TestCase):
+    def test_distribution_error(self):
+        distribution = Distribution()
+
+        self.assertRaises(NotImplementedError, distribution.sample)
+        self.assertRaises(NotImplementedError, distribution.entropy)
+
+        normal = Normal(0.0, 1.0)
+        self.assertRaises(NotImplementedError, distribution.kl_divergence,
+                          normal)
+
+        value_npdata = np.array([0.8], dtype="float32")
+        value_tensor = layers.create_tensor(dtype="float32")
+        self.assertRaises(NotImplementedError, distribution.log_prob,
+                          value_tensor)
+        self.assertRaises(NotImplementedError, distribution.probs, value_tensor)
+
+    def test_normal_error(self):
+        normal = Normal(0.0, 1.0)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, normal.log_prob, value)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, normal.probs, value)
+
+        shape = 1.0
+        # type of shape must be list
+        self.assertRaises(TypeError, normal.sample, shape)
+
+        seed = 1.0
+        # type of seed must be int
+        self.assertRaises(TypeError, normal.sample, [2, 3], seed)
+
+        normal_other = Uniform(1.0, 2.0)
+        # type of other must be an instance of Normal
+        self.assertRaises(TypeError, normal.kl_divergence, normal_other)
+
+    def test_uniform_error(self):
+        uniform = Uniform(0.0, 1.0)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, uniform.log_prob, value)
+
+        value = [1.0, 2.0]
+        # type of value must be variable
+        self.assertRaises(TypeError, uniform.probs, value)
+
+        shape = 1.0
+        # type of shape must be list
+        self.assertRaises(TypeError, uniform.sample, shape)
+
+        seed = 1.0
+        # type of seed must be int
+        self.assertRaises(TypeError, uniform.sample, [2, 3], seed)
+
+
+class DistributionTestName(unittest.TestCase):
+    def get_prefix(self, string):
+        return (string.split('.')[0])
+
+    def test_normal_name(self):
+        name = 'test_normal'
+        normal1 = Normal(0.0, 1.0, name=name)
+        self.assertEqual(normal1.name, name)
+
+        normal2 = Normal(0.0, 1.0)
+        self.assertEqual(normal2.name, 'Normal')
+
+        paddle.enable_static()
+
+        sample = normal1.sample([2])
+        self.assertEqual(self.get_prefix(sample.name), name + '_sample')
+
+        entropy = normal1.entropy()
+        self.assertEqual(self.get_prefix(entropy.name), name + '_entropy')
+
+        value_npdata = np.array([0.8], dtype="float32")
+        value_tensor = layers.create_tensor(dtype="float32")
+        layers.assign(value_npdata, value_tensor)
+
+        lp = normal1.log_prob(value_tensor)
+        self.assertEqual(self.get_prefix(lp.name), name + '_log_prob')
+
+        p = normal1.probs(value_tensor)
+        self.assertEqual(self.get_prefix(p.name), name + '_probs')
+
+        kl = normal1.kl_divergence(normal2)
+        self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence')
+
+    def test_uniform_name(self):
+        name = 'test_uniform'
+        uniform1 = Uniform(0.0, 1.0, name=name)
+        self.assertEqual(uniform1.name, name)
+
+        uniform2 = Uniform(0.0, 1.0)
+        self.assertEqual(uniform2.name, 'Uniform')
+
+        paddle.enable_static()
+
+        sample = uniform1.sample([2])
+        self.assertEqual(self.get_prefix(sample.name), name + '_sample')
+
+        entropy = uniform1.entropy()
+        self.assertEqual(self.get_prefix(entropy.name), name + '_entropy')
+
+        value_npdata = np.array([0.8], dtype="float32")
+        value_tensor = layers.create_tensor(dtype="float32")
+        layers.assign(value_npdata, value_tensor)
+
+        lp = uniform1.log_prob(value_tensor)
+        self.assertEqual(self.get_prefix(lp.name), name + '_log_prob')
+
+        p = uniform1.probs(value_tensor)
+        self.assertEqual(self.get_prefix(p.name), name + '_probs')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index cc3910d1b0c828f572f5e618b7aa9c55ecb93987..d18c8e25974441a6989b18a0fe13bac91251de9d 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest, skip_check_grad_ci
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
@@ -236,5 +237,501 @@ class TestDropoutOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_dtype)
 
 
+class TestDropoutFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[40, 40], dtype="float32")
+            res1 = paddle.nn.functional.dropout(x=input, p=0., training=False)
+            res2 = paddle.nn.functional.dropout(
+                x=input, p=0., axis=0, training=True, mode='upscale_in_train')
+            res3 = paddle.nn.functional.dropout(
+                x=input, p=0., axis=0, training=True, mode='downscale_in_infer')
+            res4 = paddle.nn.functional.dropout(
+                x=input, p=0., axis=0, training=False, mode='upscale_in_train')
+            res5 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=0,
+                training=False,
+                mode='downscale_in_infer')
+            res6 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=True,
+                mode='upscale_in_train')
+            res7 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=True,
+                mode='downscale_in_infer')
+            res8 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=False,
+                mode='upscale_in_train')
+            res9 = paddle.nn.functional.dropout(
+                x=input,
+                p=0.,
+                axis=[0, 1],
+                training=False,
+                mode='downscale_in_infer')
+            res10 = paddle.nn.functional.dropout(x=input, p=1., training=True)
+
+            in_np = np.random.random([40, 40]).astype("float32")
+            res_np = in_np
+            res_np2 = np.zeros_like(in_np)
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+            fetches2 = exe.run(fluid.default_main_program(),
+                               feed={"input": in_np},
+                               fetch_list=[res10])
+            self.assertTrue(np.allclose(fetches2[0], res_np2))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([40, 40]).astype("float32")
+                res_np = in_np
+                res_np2 = np.zeros_like(in_np)
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.dropout(
+                    x=input, p=0., training=False)
+                res2 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=True,
+                    mode='upscale_in_train')
+                res3 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=True,
+                    mode='downscale_in_infer')
+                res4 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=False,
+                    mode='upscale_in_train')
+                res5 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=0,
+                    training=False,
+                    mode='downscale_in_infer')
+                res6 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=True,
+                    mode='upscale_in_train')
+                res7 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=True,
+                    mode='downscale_in_infer')
+                res8 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=False,
+                    mode='upscale_in_train')
+                res9 = paddle.nn.functional.dropout(
+                    x=input,
+                    p=0.,
+                    axis=[0, 1],
+                    training=False,
+                    mode='downscale_in_infer')
+                res10 = paddle.nn.functional.dropout(
+                    x=input, p=1., training=True)
+
+            res_list = [res1, res2, res3, res4, res5, res6, res7, res8, res9]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+            self.assertTrue(np.allclose(res10.numpy(), res_np2))
+
+
+class TestDropoutFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_Variable():
+                # the input of dropout must be Variable.
+                x1 = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.nn.functional.dropout(x1, p=0.5)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            def test_Variable2():
+                # the input of dropout must be Variable.
+                x1 = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.nn.functional.dropout(x1, p=0.5, axis=0)
+
+            self.assertRaises(TypeError, test_Variable2)
+
+            def test_dtype():
+                # the input dtype of dropout must be float32 or float64
+                # float16 only can be set on GPU place
+                xr = fluid.data(name='xr', shape=[3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.dropout(xr, p=0.5)
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_pdtype():
+                # p should be int or float
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, p='0.5')
+
+            self.assertRaises(TypeError, test_pdtype)
+
+            def test_pvalue():
+                # p should be 0.<=p<=1.
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, p=1.2)
+
+            self.assertRaises(ValueError, test_pvalue)
+
+            def test_mode():
+                # mode should be 'downscale_in_infer' or 'upscale_in_train'
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, mode='abc')
+
+            self.assertRaises(ValueError, test_mode)
+
+            def test_axis():
+                # axis should be int or list
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, axis=1.2)
+
+            self.assertRaises(TypeError, test_axis)
+
+            def test_axis_max():
+                # maximum of axis should less than dimensions of x
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, axis=[0, 5])
+
+            self.assertRaises(ValueError, test_axis_max)
+
+            def test_axis_len():
+                # length of axis should not greater than dimensions of x
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.dropout(x2, axis=[0, 1, 2, 3, 4])
+
+            self.assertRaises(ValueError, test_axis_len)
+
+
+class TestDropoutCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([40, 40]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.Dropout(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
+class TestDropout2dFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 4, 5], dtype="float32")
+            res1 = paddle.nn.functional.dropout2d(
+                x=input, p=0., training=False, data_format='NCHW')
+            res2 = paddle.nn.functional.dropout2d(
+                x=input, p=0., training=False, data_format='NHWC')
+
+            in_np = np.random.random([2, 3, 4, 5]).astype("float32")
+            res_np = in_np
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([2, 3, 4, 5]).astype("float32")
+                res_np = in_np
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.dropout2d(
+                    x=input, p=0., training=False, data_format='NCHW')
+                res2 = paddle.nn.functional.dropout2d(
+                    x=input, p=0., training=False, data_format='NHWC')
+
+            res_list = [res1, res2]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+
+
+class TestDropout2dFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_xdim():
+                # dimentions of x should be 4
+                x = fluid.data(name='x1', shape=[2, 3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.dropout2d(x)
+
+            self.assertRaises(ValueError, test_xdim)
+
+            def test_dataformat():
+                # data_format should be 'NCHW' or 'NHWC'
+                x = fluid.data(name='x2', shape=[2, 3, 4, 5], dtype="int32")
+                paddle.nn.functional.dropout2d(x, data_format='CNHW')
+
+            self.assertRaises(ValueError, test_dataformat)
+
+
+class TestDropout2DCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 4, 5]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.Dropout2D(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
+class TestDropout3dFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 4, 5, 6], dtype="float32")
+            res1 = paddle.nn.functional.dropout3d(
+                x=input, p=0., training=False, data_format='NCDHW')
+            res2 = paddle.nn.functional.dropout3d(
+                x=input, p=0., training=False, data_format='NDHWC')
+
+            in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
+            res_np = in_np
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
+                res_np = in_np
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.dropout3d(
+                    x=input, p=0., training=False, data_format='NCDHW')
+                res2 = paddle.nn.functional.dropout3d(
+                    x=input, p=0., training=False, data_format='NDHWC')
+
+            res_list = [res1, res2]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+
+
+class TestDropout3dFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_xdim():
+                # dimentions of x should be 5
+                x = fluid.data(name='x1', shape=[2, 3, 4, 5], dtype="int32")
+                paddle.nn.functional.dropout3d(x)
+
+            self.assertRaises(ValueError, test_xdim)
+
+            def test_dataformat():
+                # data_format should be 'NCDHW' or 'NDHWC'
+                x = fluid.data(name='x2', shape=[2, 3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.dropout3d(x, data_format='CNDHW')
+
+            self.assertRaises(ValueError, test_dataformat)
+
+
+class TestDropout3DCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.Dropout3D(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
+class TestAlphaDropoutFAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[40, 40], dtype="float32")
+            res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
+            res2 = paddle.nn.functional.alpha_dropout(
+                x=input, p=0., training=False)
+
+            in_np = np.random.random([40, 40]).astype("float32")
+            res_np = in_np
+
+            exe = fluid.Executor(place)
+            res_list = [res1, res2]
+            for res in res_list:
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={"input": in_np},
+                                  fetch_list=[res])
+                self.assertTrue(np.allclose(fetches[0], res_np))
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                in_np = np.random.random([40, 40]).astype("float32")
+                res_np = in_np
+                input = fluid.dygraph.to_variable(in_np)
+
+                res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
+                res2 = paddle.nn.functional.alpha_dropout(
+                    x=input, p=0., training=False)
+
+            res_list = [res1, res2]
+            for res in res_list:
+                self.assertTrue(np.allclose(res.numpy(), res_np))
+
+
+class TestAlphaDropoutFAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+
+            def test_Variable():
+                # the input of dropout must be Variable.
+                x1 = fluid.create_lod_tensor(
+                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.nn.functional.alpha_dropout(x1, p=0.5)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            def test_dtype():
+                # the input dtype of dropout must be float32 or float64
+                xr = fluid.data(name='xr', shape=[3, 4, 5, 6], dtype="int32")
+                paddle.nn.functional.alpha_dropout(xr)
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_pdtype():
+                # p should be int or float
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.alpha_dropout(x2, p='0.5')
+
+            self.assertRaises(TypeError, test_pdtype)
+
+            def test_pvalue():
+                # p should be 0.<=p<=1.
+                x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32")
+                paddle.nn.functional.alpha_dropout(x2, p=1.2)
+
+            self.assertRaises(ValueError, test_pvalue)
+
+
+class TestAlphaDropoutCAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_np = np.random.random([40, 40]).astype("float32")
+                result_np = input_np
+                input = fluid.dygraph.to_variable(input_np)
+                m = paddle.nn.AlphaDropout(p=0.)
+                m.eval()
+                result = m(input)
+                self.assertTrue(np.allclose(result.numpy(), result_np))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..466226c53fabbd315acd19c6421f210d0ca225c1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
@@ -0,0 +1,183 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy
+import collections
+from functools import reduce
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.nn.utils import weight_norm, remove_weight_norm
+
+
+class TestDygraphWeightNorm(unittest.TestCase):
+    def setUp(self):
+        self.init_test_case()
+        self.set_data()
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = None
+
+    def set_data(self):
+        self.data = collections.OrderedDict()
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_shape = desc[1]
+            data_value = numpy.random.random(
+                size=[self.batch_size] + data_shape).astype('float32')
+            self.data[data_name] = data_value
+
+    def norm_except_dim(self, w, dim=None):
+        shape = w.shape
+        ndims = len(shape)
+        shape_numel = reduce(lambda x, y: x * y, shape)
+        if dim == -1:
+            return numpy.linalg.norm(w, axis=None, keepdims=True)
+        elif dim == 0:
+            tile_shape = list(w.shape)
+            tile_shape[0] = 1
+            w_matrix = numpy.reshape(w, (shape[0], shape_numel // shape[0]))
+            return numpy.linalg.norm(w_matrix, axis=1, keepdims=True)
+        elif dim == (ndims - 1):
+            w_matrix = numpy.reshape(w, (shape_numel // shape[-1], shape[-1]))
+            return numpy.linalg.norm(w_matrix, axis=0, keepdims=True)
+        else:
+            perm = list(range(ndims))
+            perm_ori = list(range(ndims))
+            perm[0] = dim
+            perm[dim] = 0
+            p_transposed = numpy.transpose(w, perm)
+            return self.norm_except_dim(p_transposed, 0)
+
+    def weight_normalize(self, w, dim=None):
+        shape = w.shape
+        ndims = len(shape)
+        shape_numel = reduce(lambda x, y: x * y, shape)
+        v = w
+        g = self.norm_except_dim(w, dim)
+        g_mul = g
+
+        if dim == -1:
+            v_norm = v / (numpy.linalg.norm(v, axis=None, keepdims=True))
+        elif dim == 0:
+            w_matrix = numpy.reshape(w, (shape[0], shape_numel // shape[0]))
+            v_norm = v / numpy.linalg.norm(w_matrix, axis=1)
+            v_norm = numpy.reshape(v_norm, shape)
+            g = numpy.squeeze(g, axis=1)
+        elif dim == (ndims - 1):
+            w_matrix = numpy.reshape(w, (shape_numel // shape[-1], shape[-1]))
+            v_norm = v / numpy.linalg.norm(w_matrix, axis=0, keepdims=True)
+            v_norm = numpy.reshape(v_norm, shape)
+        else:
+            perm = list(range(ndims))
+            perm[0] = dim
+            perm[dim] = 0
+            p_transposed = numpy.transpose(v, perm)
+            transposed_shape = p_transposed.shape
+            transposed_shape_numel = reduce(lambda x, y: x * y,
+                                            transposed_shape)
+            p_matrix = numpy.reshape(
+                p_transposed, (p_transposed.shape[0],
+                               transposed_shape_numel // p_transposed.shape[0]))
+            v_norm = v / numpy.expand_dims(
+                numpy.expand_dims(
+                    numpy.linalg.norm(
+                        p_matrix, axis=1, keepdims=True), axis=0),
+                axis=(ndims - 1))
+            v_norm = numpy.reshape(v_norm, transposed_shape)
+            v_norm = numpy.transpose(v_norm, perm)
+            g = numpy.squeeze(g, axis=1)
+            if dim == 1:
+                eaxis = 2
+            elif dim == 2:
+                eaxis = 1
+            g_mul = numpy.expand_dims(
+                numpy.expand_dims(
+                    numpy.expand_dims(
+                        g, axis=0), axis=eaxis),
+                axis=(ndims - 1))
+        w = g_mul * v_norm
+        return g, v
+
+    def test_check_output(self):
+        fluid.enable_imperative()
+        linear = paddle.nn.Conv2d(2, 3, 3)
+        before_weight = linear.weight.numpy()
+        if self.dim == None:
+            self.dim = -1
+        wn = weight_norm(linear, dim=self.dim)
+        outputs = []
+        for name, data in self.data.items():
+            output = linear(fluid.dygraph.to_variable(data))
+            outputs.append(output.numpy())
+        after_weight = linear.weight
+        self.actual_outputs = [linear.weight_g.numpy(), linear.weight_v.numpy()]
+
+        expect_output = self.weight_normalize(before_weight, self.dim)
+
+        for expect, actual in zip(expect_output, self.actual_outputs):
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual), expect, atol=0.001))
+
+
+class TestDygraphWeightNormCase1(TestDygraphWeightNorm):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = 0
+
+
+class TestDygraphWeightNormCase2(TestDygraphWeightNorm):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = 1
+
+
+class TestDygraphWeightNormCase3(TestDygraphWeightNorm):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = 3
+
+
+class TestDygraphRemoveWeightNorm(unittest.TestCase):
+    def setUp(self):
+        self.init_test_case()
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.data_desc = (['x', [2, 3, 3]], )
+        self.dim = None
+
+    def test_check_output(self):
+        fluid.enable_imperative()
+        linear = paddle.nn.Conv2d(2, 3, 3)
+        before_weight = linear.weight
+        wn = weight_norm(linear, dim=self.dim)
+        rwn = remove_weight_norm(linear)
+        after_weight = linear.weight
+        self.assertTrue(
+            numpy.allclose(
+                before_weight.numpy(), after_weight.numpy(), atol=0.001))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 6eeb355a6ba3a9c20156ebfd1389d50e92a5a0f5..c941d7c5f34352ac0e762403d0e7e3f0238cbe36 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -397,7 +397,7 @@ class TestAddOp(unittest.TestCase):
             y_1 = paddle.add(x, y, name='add_res')
             self.assertEqual(('add_res' in y_1.name), True)
 
-    def test_alpha(self):
+    def test_declarative(self):
         with fluid.program_guard(fluid.Program()):
 
             def gen_data():
@@ -408,33 +408,12 @@ class TestAddOp(unittest.TestCase):
 
             x = fluid.data(name="x", shape=[3], dtype='float32')
             y = fluid.data(name="y", shape=[3], dtype='float32')
-            z = paddle.add(x, y, alpha=10)
+            z = paddle.add(x, y)
 
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
-            z_expected = np.array([12., 53., 24.])
-            self.assertEqual((z_value == z_expected).all(), True)
-
-    def test_alpha_gpu(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        with fluid.program_guard(fluid.Program()):
-
-            def gen_data():
-                return {
-                    "x": np.array([2, 3, 4]).astype('float32'),
-                    "y": np.array([1, 5, 2]).astype('float32')
-                }
-
-            x = fluid.data(name="x", shape=[3], dtype='float32')
-            y = fluid.data(name="y", shape=[3], dtype='float32')
-            z = paddle.add(x, y, alpha=-0.5)
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
-            z_expected = np.array([1.5, 0.5, 3.])
+            z_expected = np.array([3., 8., 6.])
             self.assertEqual((z_value == z_expected).all(), True)
 
     def test_dygraph(self):
@@ -443,9 +422,9 @@ class TestAddOp(unittest.TestCase):
             np_y = np.array([1, 5, 2]).astype('float64')
             x = fluid.dygraph.to_variable(np_x)
             y = fluid.dygraph.to_variable(np_y)
-            z = paddle.add(x, y, alpha=-0.5)
+            z = paddle.add(x, y)
             np_z = z.numpy()
-            z_expected = np.array([1.5, 0.5, 3.])
+            z_expected = np.array([3., 8., 6.])
             self.assertEqual((np_z == z_expected).all(), True)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index de0fc591b664728387ccb988f3611fe034989627..9ebaf8ff9438be8c8a57815be0798b861d05caaf 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -240,25 +240,124 @@ class TestElementwiseDivBroadcast(unittest.TestCase):
             self.assertEqual((out_result == (2 / x)).all(), True)
 
 
-class TestDivOp(unittest.TestCase):
-    def test_name(self):
-        with fluid.program_guard(fluid.Program()):
-            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
-            y = fluid.data(name='y', shape=[2, 3], dtype='float32')
+class TestDivideAPI(unittest.TestCase):
+    def setUp(self):
+        paddle.set_default_dtype("float64")
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        # rule 1
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = np.array([1, 2, 3])
+            self.assertRaises(TypeError, paddle.divide, x=x, y=y)
+
+        # rule 2: both the inputs are not Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = 2
+            y = 4
+            res = paddle.divide(x, y)
+            exe = fluid.Executor(place)
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={},
+                           fetch_list=[res])
+            self.assertEqual(np_z[0] == 0.5, True)
+
+        # rule 3: 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.divide, x=x, y=y)
+
+        # rule 4: x is Tensor, y is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = x / y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 3, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 1.5, 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 5: y is Tensor, x is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = y / x
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 8, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 0.25, 0.5])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x / y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={
+                               "x": np.array([2, 3, 4]).astype('float64'),
+                               "y": np.array([1, 5, 2]).astype('float64')
+                           },
+                           fetch_list=[res])
+            z_expected = np.array([2., 0.6, 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
 
-            y_1 = paddle.div(x, y, name='div_res')
-            self.assertEqual(('div_res' in y_1.name), True)
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
 
     def test_dygraph(self):
-        with fluid.dygraph.guard():
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            y = fluid.dygraph.to_variable(np_y)
-            z = paddle.div(x, y)
-            np_z = z.numpy()
-            z_expected = np.array([2., 0.6, 2.])
-            self.assertEqual((np_z == z_expected).all(), True)
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                # rule 1 : avoid numpy.ndarray
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.divide, x=x, y=np_y)
+
+                # rule 2: both the inputs are not Tensor
+                z = paddle.divide(3, 2)
+                self.assertEqual(z.numpy()[0] == 1.5, True)
+
+                # rule 3: both the inputs are Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x, dtype="float32")
+                y = paddle.to_tensor(np_y, dtype="float64")
+                self.assertRaises(TypeError, paddle.divide, x=x, y=y)
+
+                # rule 4: x is Tensor, y is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = x / y
+                z_expected = np.array([1., 1.5, 2.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 5: y is Tensor, x is scalar
+                np_x = np.array([2, 1, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = y / x
+                z_expected = np.array([1., 2., 0.5])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x / y
+                z_expected = np.array([2., 0.6, 2.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index 104e896b6e440f5657a90e0ce741b49f72ba75c6..4fe085ce854726676bc1b1bef650419b3ebbfc86 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -15,6 +15,8 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
+import paddle.fluid as fluid
 import paddle.fluid.core as core
 from op_test import OpTest
 
@@ -56,6 +58,13 @@ class TestElementwiseModOp(OpTest):
         pass
 
 
+class TestElementwiseModOpInverse(TestElementwiseModOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
 class TestElementwiseModOp_scalar(TestElementwiseModOp):
     def init_input_output(self):
         scale_x = random.randint(0, 100000000)
@@ -65,5 +74,125 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
         self.out = np.floor_divide(self.x, self.y)
 
 
+class TestFloorDivideAPI(unittest.TestCase):
+    def setUp(self):
+        paddle.set_default_dtype("float64")
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        # rule 1
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = np.array([1, 2, 3])
+            self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
+
+        # rule 2: both the inputs are not Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = 2
+            y = 4
+            res = paddle.floor_divide(x, y)
+            exe = fluid.Executor(place)
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={},
+                           fetch_list=[res])
+            self.assertEqual(np_z[0] == 0., True)
+
+        # rule 3: 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
+
+        # rule 4: x is Tensor, y is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = x // y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 3, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 1., 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 5: y is Tensor, x is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = y // x
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 8, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([1., 0., 0.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x // y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={
+                               "x": np.array([2, 3, 4]).astype('float64'),
+                               "y": np.array([1, 5, 2]).astype('float64')
+                           },
+                           fetch_list=[res])
+            z_expected = np.array([2., 0., 2.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                # rule 1 : avoid numpy.ndarray
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y)
+
+                # rule 2: both the inputs are not Tensor
+                z = paddle.floor_divide(3, 2)
+                self.assertEqual(z.numpy()[0] == 1., True)
+
+                # rule 3: both the inputs are Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x, dtype="float32")
+                y = paddle.to_tensor(np_y, dtype="float64")
+                self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
+
+                # rule 4: x is Tensor, y is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = x // y
+                z_expected = np.array([1, 1, 2])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 5: y is Tensor, x is scalar
+                np_x = np.array([2, 1, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = y // x
+                z_expected = np.array([1, 2, 0])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x // y
+                z_expected = np.array([2., 0., 2.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
index 2c0fdf51769782e046b1b18ebd31782c81fd49f0..25769a42aa261c0b5ae9fe2795a337c668580a99 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -15,6 +15,8 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
+import paddle.fluid as fluid
 import paddle.fluid.core as core
 from op_test import OpTest
 
@@ -82,5 +84,126 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
         self.dtype = np.float64
 
 
+class TestRemainderAPI(unittest.TestCase):
+    def setUp(self):
+        paddle.set_default_dtype("float64")
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        # rule 1
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = np.array([1, 2, 3])
+            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+        # rule 3: 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+        # rule 4: x is Tensor, y is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = 2
+            exe = fluid.Executor(place)
+            res = x % y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={"x": np.array([2, 3, 4]).astype('float64')},
+                           fetch_list=[res])
+            z_expected = np.array([0., 1., 0.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 5: y is Tensor, x is scalar
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = 3
+            y = fluid.data(name="y", shape=[3], dtype="float32")
+            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3], dtype="float64")
+            y = fluid.data(name="y", shape=[1], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x % y
+            np_z = exe.run(fluid.default_main_program(),
+                           feed={
+                               "x": np.array([1., 2., 4]).astype('float64'),
+                               "y": np.array([1.5]).astype('float64')
+                           },
+                           fetch_list=[res])
+            z_expected = np.array([1., 0.5, 1.0])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+        # rule 6: y is Tensor, x is Tensor
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[6], dtype="float64")
+            y = fluid.data(name="y", shape=[1], dtype="float64")
+            exe = fluid.Executor(place)
+            res = x % y
+            np_z = exe.run(
+                fluid.default_main_program(),
+                feed={
+                    "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'),
+                    "y": np.array([2]).astype('float64')
+                },
+                fetch_list=[res])
+            z_expected = np.array([1., 0., 1., 1., 0., 1.])
+            self.assertEqual((np_z[0] == z_expected).all(), True)
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                # rule 1 : avoid numpy.ndarray
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y)
+
+                # rule 3: both the inputs are Tensor
+                np_x = np.array([2, 3, 4])
+                np_y = np.array([1, 5, 2])
+                x = paddle.to_tensor(np_x, dtype="float32")
+                y = paddle.to_tensor(np_y, dtype="float64")
+                self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
+
+                # rule 4: x is Tensor, y is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x, dtype="int32")
+                y = 2
+                z = x % y
+                z_expected = np.array([0, 1, 0])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 5: y is Tensor, x is scalar
+                np_x = np.array([2, 3, 4])
+                x = paddle.to_tensor(np_x)
+                self.assertRaises(TypeError, paddle.remainder, x=3, y=x)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([1., 2., 4])
+                np_y = np.array([1.5])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x % y
+                z_expected = np.array([1., 0.5, 1.0])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+                # rule 6: y is Tensor, x is Tensor
+                np_x = np.array([-3., -2, -1, 1, 2, 3])
+                np_y = np.array([2.])
+                x = paddle.to_tensor(np_x)
+                y = paddle.to_tensor(np_y)
+                z = x % y
+                z_expected = np.array([1., 0., 1., 1., 0., 1.])
+                self.assertEqual((z_expected == z.numpy()).all(), True)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
index e86f18a62167b7feab1549072fc296f847c00491..12b75c8bf703d2b31e6abb08bb233fb2874828ce 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
@@ -29,7 +29,7 @@ class TestElementwiseMulDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -56,7 +56,7 @@ class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -83,7 +83,7 @@ class TestElementwiseAddDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -110,7 +110,7 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -137,7 +137,7 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -164,7 +164,7 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.005
         dtype = np.float64
 
@@ -191,7 +191,7 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.0001
         dtype = np.float64
 
@@ -219,7 +219,7 @@ class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 4, 5]
         eps = 0.0001
         dtype = np.float64
 
diff --git a/python/paddle/fluid/tests/unittests/test_erf_op.py b/python/paddle/fluid/tests/unittests/test_erf_op.py
index 93ab0212f136adfedacb52a2fde47e15edf279d3..964e704c6a2ccbdc96fc281f6e417caf8351cdf7 100644
--- a/python/paddle/fluid/tests/unittests/test_erf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_erf_op.py
@@ -19,6 +19,7 @@ import numpy as np
 from scipy.special import erf
 from op_test import OpTest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 
@@ -58,6 +59,12 @@ class TestErfLayer(unittest.TestCase):
         if fluid.is_compiled_with_cuda():
             self._test_case(fluid.CUDAPlace(0))
 
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = paddle.static.data('x', [3, 4])
+            y = paddle.erf(x, name='erf')
+            self.assertTrue('erf' in y.name)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b1e3c5a28a5498d1a06654ea0a4ddcac6c7592b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
@@ -0,0 +1,84 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+class TestExecutor(unittest.TestCase):
+    def net(self):
+        lr = fluid.data(name="lr", shape=[1], dtype='float32')
+        x = fluid.data(name="x", shape=[None, 1], dtype='float32')
+        y = fluid.data(name="y", shape=[None, 1], dtype='float32')
+        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+
+        opt = fluid.optimizer.Adam(learning_rate=lr)
+        opt.minimize(avg_cost)
+
+        return lr, avg_cost
+
+    def test_program_check_feed(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.Scope()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.scope_guard(scope):
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                train_data = [[1.0], [2.0], [3.0], [4.0]]
+                y_true = [[2.0], [4.0], [6.0], [8.0]]
+                a = 0
+                with self.assertRaises(ValueError):
+                    exe.run(feed={'x': train_data,
+                                  'lr': a},
+                            fetch_list=[lr, cost],
+                            return_numpy=False,
+                            use_prune=True)
+
+    def test_compiled_program_check_feed(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.Scope()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.scope_guard(scope):
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                compiled_prog = fluid.CompiledProgram(
+                    main_program).with_data_parallel(loss_name=cost.name)
+                train_data = [[1.0], [2.0], [3.0], [4.0]]
+                y_true = [[2.0], [4.0], [6.0], [8.0]]
+                a = 0
+                with self.assertRaises(ValueError):
+                    exe.run(compiled_prog,
+                            feed={'x': train_data,
+                                  'lr': a},
+                            fetch_list=[lr, cost],
+                            return_numpy=False,
+                            use_prune=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..4bc6bf3744f26cf7618d255f306bdb8f5fefb7a0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
@@ -0,0 +1,132 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+
+class TestExpandAsOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(100).astype("float64")
+        target_tensor = np.random.rand(2, 100).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [2, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(10, 12).astype("float64")
+        target_tensor = np.random.rand(10, 12).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(2, 3, 20).astype("float64")
+        target_tensor = np.random.rand(2, 3, 20).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [1, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand_as_v2"
+        x = np.random.rand(1, 1, 7, 16).astype("float64")
+        target_tensor = np.random.rand(4, 6, 7, 16).astype("float64")
+        self.inputs = {'X': x, 'target_tensor': target_tensor}
+        self.attrs = {}
+        bcast_dims = [4, 6, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandAsV2Error(unittest.TestCase):
+    def test_errors(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x1 = fluid.layers.data(name='x1', shape=[4], dtype="uint8")
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
+            self.assertRaises(TypeError, paddle.tensor.expand_as, x1, x2)
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool")
+            x3.stop_gradient = False
+            self.assertRaises(ValueError, paddle.tensor.expand_as, x3, x2)
+
+
+# Test python API
+class TestExpandAsV2API(unittest.TestCase):
+    def test_api(self):
+        input1 = np.random.random([12, 14]).astype("float32")
+        input2 = np.random.random([2, 12, 14]).astype("float32")
+        x = fluid.layers.data(
+            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+
+        y = fluid.layers.data(
+            name='target_tensor',
+            shape=[2, 12, 14],
+            append_batch_size=False,
+            dtype="float32")
+
+        out_1 = paddle.expand_as(x, y=y)
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res_1 = exe.run(fluid.default_main_program(),
+                        feed={"x": input1,
+                              "target_tensor": input2},
+                        fetch_list=[out_1])
+        assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..aee6ca249f535b9c06c00a6806ac491be16cd4b3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
@@ -0,0 +1,234 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+
+
+# Situation 1: shape is a list(without tensor)
+class TestExpandV2OpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+
+        self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")}
+        self.attrs = {'shape': self.shape}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.shape = [100]
+        self.expand_times = [1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandV2OpRank2_DimExpanding(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = [120]
+        self.shape = [2, 120]
+        self.expand_times = [2, 1]
+
+
+class TestExpandV2OpRank2(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = [1, 140]
+        self.shape = [12, 140]
+        self.expand_times = [12, 1]
+
+
+class TestExpandV2OpRank3_Corner(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.shape = (2, 10, 5)
+        self.expand_times = (1, 1, 1)
+
+
+class TestExpandV2OpRank4(TestExpandV2OpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.shape = (-1, -1, -1, -1)
+        self.expand_times = (1, 1, 1, 1)
+
+
+# Situation 2: shape is a list(with tensor)
+class TestExpandV2OpRank1_tensor_attr(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+        expand_shapes_tensor = []
+        for index, ele in enumerate(self.expand_shape):
+            expand_shapes_tensor.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'expand_shapes_tensor': expand_shapes_tensor,
+        }
+        self.attrs = {"shape": self.infer_expand_shape}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.expand_times = [1]
+        self.expand_shape = [100]
+        self.infer_expand_shape = [-1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.expand_times = [1, 1]
+        self.expand_shape = [12, 14]
+        self.infer_expand_shape = [12, -1]
+
+
+# Situation 3: shape is a tensor
+class TestExpandV2OpRank1_tensor(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'Shape': np.array(self.expand_shape).astype("int32"),
+        }
+        self.attrs = {}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.expand_times = [2, 1]
+        self.expand_shape = [2, 100]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+# Situation 4: input x is Integer
+class TestExpandV2OpInteger(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int32")
+        }
+        self.attrs = {'shape': [2, 4, 5]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 5: input x is Bool
+class TestExpandV2OpBoolean(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
+        self.attrs = {'shape': [2, 4, 5]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 56: input x is Integer
+class TestExpandV2OpInt64_t(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int64")
+        }
+        self.attrs = {'shape': [2, 4, 5]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestExpandV2Error(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            shape = [2, 2]
+            self.assertRaises(TypeError, paddle.tensor.expand, x1, shape)
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
+            self.assertRaises(TypeError, paddle.tensor.expand, x2, shape)
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool")
+            x3.stop_gradient = False
+            self.assertRaises(ValueError, paddle.tensor.expand, x3, shape)
+
+
+# Test python API
+class TestExpandV2API(unittest.TestCase):
+    def test_api(self):
+        input = np.random.random([12, 14]).astype("float32")
+        x = fluid.layers.data(
+            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+
+        positive_2 = fluid.layers.fill_constant([1], "int32", 12)
+        expand_shape = fluid.layers.data(
+            name="expand_shape",
+            shape=[2],
+            append_batch_size=False,
+            dtype="int32")
+
+        out_1 = paddle.expand(x, shape=[12, 14])
+        out_2 = paddle.expand(x, shape=[positive_2, 14])
+        out_3 = paddle.expand(x, shape=expand_shape)
+
+        g0 = fluid.backward.calc_gradient(out_2, x)
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
+                                      feed={
+                                          "x": input,
+                                          "expand_shape":
+                                          np.array([12, 14]).astype("int32")
+                                      },
+                                      fetch_list=[out_1, out_2, out_3])
+        assert np.array_equal(res_1, np.tile(input, (1, 1)))
+        assert np.array_equal(res_2, np.tile(input, (1, 1)))
+        assert np.array_equal(res_3, np.tile(input, (1, 1)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py
index 1a0a4ecb74d56910b3f92924085203f83b2c0145..9b541c323eceaa32591dbdc2ec149868ad7e8673 100644
--- a/python/paddle/fluid/tests/unittests/test_eye_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eye_op.py
@@ -74,73 +74,70 @@ class TestEyeOp2(OpTest):
 
 class API_TestTensorEye(unittest.TestCase):
     def test_out(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10)
             place = fluid.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[data])
             expected_result = np.eye(10, dtype="float32")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10, num_columns=7, dtype="float64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[data])
             expected_result = np.eye(10, 7, dtype="float64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10, dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[data])
             expected_result = np.eye(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.imperative.guard():
-            out = paddle.eye(10, dtype="int64")
-            expected_result = np.eye(10, dtype="int64")
+        paddle.disable_static()
+        out = paddle.eye(10, dtype="int64")
+        expected_result = np.eye(10, dtype="int64")
+        paddle.enable_static()
         self.assertEqual((out.numpy() == expected_result).all(), True)
 
-        with paddle.imperative.guard():
-            batch_shape = [2]
-            out = fluid.layers.eye(10,
-                                   10,
-                                   dtype="int64",
-                                   batch_shape=batch_shape)
-            result = np.eye(10, dtype="int64")
-            expected_result = []
-            for index in reversed(batch_shape):
-                tmp_result = []
-                for i in range(index):
-                    tmp_result.append(result)
-                result = tmp_result
-                expected_result = np.stack(result, axis=0)
+        paddle.disable_static()
+        batch_shape = [2]
+        out = fluid.layers.eye(10, 10, dtype="int64", batch_shape=batch_shape)
+        result = np.eye(10, dtype="int64")
+        expected_result = []
+        for index in reversed(batch_shape):
+            tmp_result = []
+            for i in range(index):
+                tmp_result.append(result)
+            result = tmp_result
+            expected_result = np.stack(result, axis=0)
+        paddle.enable_static()
         self.assertEqual(out.numpy().shape == np.array(expected_result).shape,
                          True)
         self.assertEqual((out.numpy() == expected_result).all(), True)
 
-        with paddle.imperative.guard():
-            batch_shape = [3, 2]
-            out = fluid.layers.eye(10,
-                                   10,
-                                   dtype="int64",
-                                   batch_shape=batch_shape)
-            result = np.eye(10, dtype="int64")
-            expected_result = []
-            for index in reversed(batch_shape):
-                tmp_result = []
-                for i in range(index):
-                    tmp_result.append(result)
-                result = tmp_result
-                expected_result = np.stack(result, axis=0)
+        paddle.disable_static()
+        batch_shape = [3, 2]
+        out = fluid.layers.eye(10, 10, dtype="int64", batch_shape=batch_shape)
+        result = np.eye(10, dtype="int64")
+        expected_result = []
+        for index in reversed(batch_shape):
+            tmp_result = []
+            for i in range(index):
+                tmp_result.append(result)
+            result = tmp_result
+            expected_result = np.stack(result, axis=0)
+        paddle.enable_static()
         self.assertEqual(out.numpy().shape == np.array(expected_result).shape,
                          True)
         self.assertEqual((out.numpy() == expected_result).all(), True)
 
     def test_errors(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
 
             def test_num_rows_type_check():
                 paddle.eye(-1, dtype="int64")
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 0812b02b47db7fa2d43e1d3bbd0a3f7b59911326..b30e0a6775ea9901d8c2a3a56b2e80141fffd23c 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -31,45 +31,45 @@ def dequantize_max_abs(x, scale, max_range):
     return y
 
 
-def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
+def channel_wise_quantize_max_abs(x, quant_bit=8, quant_axis=0):
+    assert quant_axis in [0, 1], "The quant_axis should be 0 or 1."
     scales = []
-    if not use_second_dim:
+    y = x.copy()
+    max_range = math.pow(2, quant_bit - 1) - 1
+    if quant_axis == 0:
         for i in range(x.shape[0]):
-            scales.append(np.max(np.abs(x[i])).astype("float32"))
-        y = x.copy()
-        max_range = math.pow(2, quant_bit - 1) - 1
-        for i, scale in enumerate(scales):
-            y[i] = np.round(x[i] / scale * max_range)
-    else:
-        for i in range(x.shape[0]):
-            s = []
-            for j in range(x.shape[1]):
-                s.append(np.max(np.abs(x[i][j])).astype("float32"))
-            scales.append(s)
-        scales = np.amax(np.array(scales), axis=0)
-        y = x.copy()
-        max_range = math.pow(2, quant_bit - 1) - 1
-        for i in range(x.shape[0]):
-            for j, scale in enumerate(scales):
-                y[i][j] = np.round(x[i][j] / scale * max_range)
+            scale = np.max(np.abs(x[i])).astype("float32")
+            scales.append(scale)
+            y[i] = np.round(x[i] * max_range / scale)
+    elif quant_axis == 1:
+        for i in range(x.shape[1]):
+            scale = np.max(np.abs(x[:, i])).astype("float32")
+            scales.append(scale)
+            y[:, i] = np.round(x[:, i] * max_range / scale)
     return y, scales
 
 
 def channel_wise_dequantize_max_abs(x,
                                     scales,
                                     quant_bits,
+                                    quant_axis,
                                     activation_scale=None):
-    if activation_scale is None:
-        y = x.copy()
-        for i in range(x.shape[0]):
-            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
+    assert quant_axis in [0, 1], "The quant_axis should be 0 or 1."
+
+    if isinstance(quant_bits, list):
+        max_range = math.pow(2, quant_bits[0] - 1) - 1
     else:
-        y = x.copy()
+        max_range = math.pow(2, quant_bits - 1) - 1
+    y = x.copy()
+    if quant_axis == 0:
         for i in range(x.shape[0]):
-            for j in range(x.shape[1]):
-                y[i][j] = (scales[j] /
-                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
-        y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
+            y[i] = x[i] * scales[i] / max_range
+    elif quant_axis == 1:
+        for i in range(x.shape[1]):
+            y[:, i] = x[:, i] * scales[i] / max_range
+
+    if activation_scale is not None:
+        y = y * activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
     return y
 
 
@@ -83,9 +83,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
         self.set_args()
         self.op_type = "fake_channel_wise_dequantize_max_abs"
         x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(
-            x, self.quant_bits[0], use_second_dim=True)
-        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
+        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0], 1)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, 1,
                                               self.activation_scale)
 
         self.inputs = {
@@ -105,25 +104,39 @@ class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
     def set_args(self):
         self.quant_bits = [8]
         self.data_type = "float32"
+        self.quant_axis = 0
 
     def setUp(self):
         self.set_args()
         self.op_type = "fake_channel_wise_dequantize_max_abs"
         x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
-        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)
+        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0],
+                                                   self.quant_axis)
+        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
+                                              self.quant_axis)
 
         self.inputs = {
             'X': yq,
             'Scales': [("scales0", np.array(scales).astype(self.data_type))]
         }
-        self.attrs = {'quant_bits': self.quant_bits}
+        self.attrs = {
+            'quant_bits': self.quant_bits,
+            'quant_axis': self.quant_axis
+        }
         self.outputs = {'Out': ydq}
 
     def test_check_output(self):
         self.check_output()
 
 
+class TestFakeChannelWiseDequantizeMaxAbsOpOneScale1(
+        TestFakeChannelWiseDequantizeMaxAbsOpOneScale):
+    def set_args(self):
+        self.quant_bits = [8]
+        self.data_type = "float32"
+        self.quant_axis = 1
+
+
 class TestFakeDequantizeMaxAbsOp(OpTest):
     def set_args(self):
         self.num_bits = 8
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 1c8335e3bceab24cba9364a96f6907d2cf585fe0..7835fd3f53ddb7f9a95313c6cc5fc7b72ae6d664 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -72,28 +72,62 @@ class TestFakeQuantizeOp2(OpTest):
 
 class TestFakeChannelWiseQuantizeOp(OpTest):
     def setUp(self):
+        self.set_arg()
+        assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1."
+
         self.op_type = "fake_channel_wise_quantize_abs_max"
-        self.attrs = {'bit_length': 8}
-        self.inputs = {
-            'X': np.random.random((4, 3, 64, 64)).astype("float32"),
-        }
+        self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis}
+
         scales = []
-        for i in range(self.inputs['X'].shape[0]):
-            scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
         outputs = self.inputs['X'].copy()
-        for i, scale in enumerate(scales):
-            outputs[i] = np.round(outputs[i] / scale * (
-                (1 << (self.attrs['bit_length'] - 1)) - 1))
+        bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
+        if self.quant_axis == 0:
+            for i in range(self.inputs['X'].shape[0]):
+                scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32")
+                scales.append(scale_v)
+                outputs[i] = np.round(outputs[i] / scale_v * bnt)
+        elif self.quant_axis == 1:
+            for i in range(self.inputs['X'].shape[1]):
+                scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype(
+                    "float32")
+                scales.append(scale_v)
+                outputs[:, i] = np.round(outputs[:, i] / scale_v * bnt)
 
         self.outputs = {
             'Out': outputs,
             'OutScale': np.array(scales).astype("float32"),
         }
 
+    def set_arg(self):
+        self.quant_axis = 0
+        self.inputs = {
+            'X': np.random.random((20, 15, 6, 6)).astype("float32"),
+        }
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestFakeChannelWiseQuantizeOp1(TestFakeChannelWiseQuantizeOp):
+    def set_quant_axis(self):
+        self.quant_axis = 1
+        self.inputs = {
+            'X': np.random.random((15, 20, 5, 5)).astype("float32"),
+        }
+
+
+class TestFakeChannelWiseQuantizeOp2(TestFakeChannelWiseQuantizeOp):
+    def set_quant_axis(self):
+        self.quant_axis = 0
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
+class TestFakeChannelWiseQuantizeOp3(TestFakeChannelWiseQuantizeOp):
+    def set_quant_axis(self):
+        self.quant_axis = 1
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
 class TestFakeQuantizeRangeAbsMaxOp(OpTest):
     def setUp(self):
         self.op_type = "fake_quantize_range_abs_max"
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 3eb761f925a677dcbaa3d7e39221299013f84b33..3475320eeebc55a14dd569410610b70ae35e65a3 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -269,18 +269,26 @@ class TestFillConstantAPI(unittest.TestCase):
         out_6 = fluid.layers.fill_constant(
             shape=shape_tensor_int64, dtype=np.float32, value=1.1)
 
-        val = fluid.layers.fill_constant(shape=[1], dtype=np.float32, value=1.1)
+        val1 = fluid.layers.fill_constant(
+            shape=[1], dtype=np.float32, value=1.1)
+        val2 = fluid.layers.fill_constant(
+            shape=[1], dtype=np.float64, value=1.1)
         out_7 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=val)
+            shape=shape_tensor_int64, dtype=np.float32, value=val1)
+
+        out_8 = fluid.layers.fill_constant(
+            shape=shape_tensor_int64, dtype=np.float32, value=val2)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2, res_3, res_4, res_5, res_6, res_7 = exe.run(
+        res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run(
             fluid.default_main_program(),
             feed={
                 "shape_tensor_int32": np.array([1, 2]).astype("int32"),
                 "shape_tensor_int64": np.array([1, 2]).astype("int64"),
             },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7])
+            fetch_list=[
+                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
+            ])
 
         assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
@@ -289,6 +297,38 @@ class TestFillConstantAPI(unittest.TestCase):
         assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32"))
+        assert np.array_equal(res_8, np.full([1, 2], 1.1, dtype="float32"))
+
+
+class TestFillConstantImperative(unittest.TestCase):
+    def test_api(self):
+        with fluid.dygraph.guard():
+            data1 = np.array([1, 2]).astype('int32')
+            data2 = np.array([1.1]).astype('float32')
+            data3 = np.array([88]).astype('int32')
+            shape = fluid.dygraph.to_variable(data1)
+            val = fluid.dygraph.to_variable(data2)
+            value = fluid.dygraph.to_variable(data3)
+            res1 = fluid.layers.fill_constant(
+                shape=[1, 2], dtype='float32', value=1.1)
+            res2 = fluid.layers.fill_constant(
+                shape=shape, dtype='float32', value=1.1)
+            res3 = fluid.layers.fill_constant(
+                shape=shape, dtype='float32', value=val)
+            res4 = fluid.layers.fill_constant(
+                shape=shape, dtype='int32', value=value)
+            assert np.array_equal(
+                res1.numpy(), np.full(
+                    [1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(
+                res2.numpy(), np.full(
+                    [1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(
+                res3.numpy(), np.full(
+                    [1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(
+                res4.numpy(), np.full(
+                    [1, 2], 88, dtype="int32"))
 
 
 class TestFillConstantOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..642044bb4b1152b0c6d2b5a8a64e22410f9bd151
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
@@ -0,0 +1,207 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+import paddle
+from op_test import OpTest
+
+
+class TestFlattenOp(OpTest):
+    def setUp(self):
+        self.op_type = "flatten_contiguous_range"
+        self.start_axis = 0
+        self.stop_axis = -1
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.in_shape).astype("float64")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.in_shape).astype("float32")
+        }
+
+    def test_check_output(self):
+        self.check_output(no_check_set=["XShape"])
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.start_axis = 0
+        self.stop_axis = -1
+        self.new_shape = (120)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlattenOp_1(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.start_axis = 1
+        self.stop_axis = 2
+        self.new_shape = (3, 10, 4)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlattenOp_2(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.start_axis = 0
+        self.stop_axis = 1
+        self.new_shape = (6, 5, 4)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlattenOp_3(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.start_axis = 0
+        self.stop_axis = 2
+        self.new_shape = (30, 4)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlattenOp_4(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.start_axis = -2
+        self.stop_axis = -1
+        self.new_shape = (3, 2, 20)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlattenOp_5(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.start_axis = 2
+        self.stop_axis = 2
+        self.new_shape = (3, 2, 5, 4)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlattenOpSixDims(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 3, 2, 4, 4)
+        self.start_axis = 3
+        self.stop_axis = 5
+        self.new_shape = (3, 2, 3, 32)
+
+    def init_attrs(self):
+        self.attrs = {
+            "start_axis": self.start_axis,
+            "stop_axis": self.stop_axis
+        }
+
+
+class TestFlatten2OpError(unittest.TestCase):
+    def test_errors(self):
+        image_shape = (2, 3, 4, 4)
+        x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
+                      image_shape[3]).reshape(image_shape) / 100.
+        x = x.astype('float32')
+
+        def test_ValueError1():
+            x_var = paddle.static.data(
+                name="x", shape=image_shape, dtype='float32')
+            out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
+
+        self.assertRaises(ValueError, test_ValueError1)
+
+        def test_ValueError2():
+            x_var = paddle.static.data(
+                name="x", shape=image_shape, dtype='float32')
+            paddle.flatten(x_var, start_axis=10, stop_axis=1)
+
+        self.assertRaises(ValueError, test_ValueError2)
+
+        def test_ValueError3():
+            x_var = paddle.static.data(
+                name="x", shape=image_shape, dtype='float32')
+            paddle.flatten(x_var, start_axis=2, stop_axis=10)
+
+        self.assertRaises(ValueError, test_ValueError3)
+
+        def test_type():
+            # dtype must be float32, float64, int8, int32, int64.
+            x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
+                           image_shape[3]).reshape(image_shape) / 100.
+            x2 = x2.astype('float16')
+            x2_var = paddle.data(name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            paddle.flatten(x2_var)
+
+        self.assertRaises(TypeError, test_type)
+
+        def test_InputError():
+            out = paddle.flatten(x)
+
+        self.assertRaises(ValueError, test_InputError)
+
+
+class TestFlattenPython(unittest.TestCase):
+    def test_python_api(self):
+        image_shape = (2, 3, 4, 4)
+        x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
+                      image_shape[3]).reshape(image_shape) / 100.
+        x = x.astype('float32')
+
+        def test_InputError():
+            out = paddle.flatten(x)
+
+        self.assertRaises(ValueError, test_InputError)
+
+        def test_Negative():
+            paddle.disable_static()
+            img = paddle.to_variable(x)
+            out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
+            return out.numpy().shape
+
+        res_shape = test_Negative()
+        self.assertTrue((2, 3, 16) == res_shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet.py b/python/paddle/fluid/tests/unittests/test_fleet.py
index 449f31faf4035971f996e76612f10c882ce9179c..a705d5ee661fd5d0d28b791d6db4624b78281743 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet.py
@@ -34,7 +34,8 @@ class TestFleet1(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -48,10 +49,10 @@ class TestFleet1(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index ae4b5d7ecd7c5131e38904a0d8fde0b9bb4fbb89..38c3903306e6e76188cdb50476d6797814c434e9 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 import unittest
 import paddle
 import os
@@ -23,8 +25,6 @@ class TestFleetAMPOptimizer(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
 
     def test_amp_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         input_x = paddle.fluid.layers.data(
@@ -38,7 +38,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.amp = True
         strategy.amp_configs = {
             "init_loss_scaling": 32768,
@@ -51,7 +51,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
             "custom_black_list": ['tanh'],
         }
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 20542da3f05ec84b51dee8a9c5913bb20630f4a2..9e651dea24ba7f35f3785093da8ac73dde07be5a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -14,7 +14,10 @@
 
 import unittest
 import paddle
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 import os
+import paddle.fluid as fluid
 
 
 class TestFleetBase(unittest.TestCase):
@@ -26,67 +29,49 @@ class TestFleetBase(unittest.TestCase):
                        "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_init(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
 
     def test_is_first_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_first_worker():
             print("test fleet first worker done.")
 
     def test_worker_index(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         print(fleet.worker_index())
 
     def test_worker_num(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         print(fleet.worker_num())
 
     def test_is_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             print("test fleet is worker")
 
     def test_worker_endpoints(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         print(fleet.worker_endpoints(to_string=True))
 
     def test_server_num(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
             print("fleet server num: {}".format(fleet.server_num()))
 
     def test_server_index(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
             print("fleet server index: {}".format(fleet.server_index()))
 
     def test_server_endpoints(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
@@ -94,83 +79,50 @@ class TestFleetBase(unittest.TestCase):
                 fleet.server_endpoints(to_string=True)))
 
     def test_is_server(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_server():
             print("test fleet is server")
 
     def test_util(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         self.assertEqual(fleet.util, None)
 
     def test_barrier_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.barrier_worker()
 
     def test_init_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.init_worker()
 
     def test_run_server(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.run_worker()
 
     def test_stop_worker(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         if fleet.is_worker():
             fleet.stop_worker()
 
     def test_distributed_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        strategy = fleet.DistributedStrategy()
-        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-
-    def test_minimize(self):
-        import paddle
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        strategy = fleet.DistributedStrategy()
         optimizer = paddle.optimizer.SGD(learning_rate=0.001)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        optimizer = fleet.distributed_optimizer(optimizer)
+
+    def test_exception(self):
+        import paddle.distributed.fleet as fleet
+        self.assertRaises(Exception, fleet.init_worker)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d666ea6740be149723e3bdbc00857a8931ce318e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.fluid as fluid
+
+
+class TestFleetBase(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_ps_minimize(self):
+        import paddle
+        import paddle.distributed.fleet as fleet
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        role = fleet.PaddleCloudRoleMaker(is_collective=False)
+        fleet.init(role)
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name)
+        compiled_prog = fluid.compiler.CompiledProgram(
+            fluid.default_main_program())
+        self.assertRaises(
+            Exception,
+            fleet.save_inference_model,
+            dirname='/tmp/',
+            feeded_var_names=['x', 'y'],
+            target_vars=[avg_cost],
+            executor=pe)
+
+        self.assertRaises(
+            Exception,
+            fleet.save_inference_model,
+            dirname='/tmp/',
+            feeded_var_names=['x', 'y'],
+            target_vars=[avg_cost],
+            executor="exe")
+
+        self.assertRaises(
+            Exception,
+            fleet.save_inference_model,
+            dirname='/tmp/',
+            feeded_var_names=['x', 'y'],
+            target_vars=[avg_cost],
+            executor=exe,
+            main_program=compiled_prog)
+
+        self.assertRaises(
+            Exception, fleet.save_persistables, executor=pe, dirname='/tmp/')
+
+        self.assertRaises(
+            Exception, fleet.save_persistables, executor="exe", dirname='/tmp/')
+
+        self.assertRaises(
+            Exception,
+            fleet.save_persistables,
+            executor=exe,
+            dirname='/tmp/',
+            main_program=compiled_prog)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e888ab0eb3ca597bf62245ff9f3024fe81ee95
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import paddle
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.fluid as fluid
+
+
+class TestFleetBase(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_collective_minimize(self):
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        strategy = fleet.DistributedStrategy()
+        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b3fbb86a4af55d6838df3a628bf2cf194c5235d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.fluid as fluid
+
+
+class TestFleetBase(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_fleet_init(self):
+        import paddle.distributed.fleet as fleet
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        role = fleet.PaddleCloudRoleMaker(is_collective=False)
+        fleet.init(role)
+        fleet.init()
+        fleet.init(is_collective=False)
+        self.assertRaises(Exception, fleet.init, is_collective="F")
+        self.assertRaises(Exception, fleet.init, role_maker="F")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
index 3318b67cadc74080468c0eab7ae1df1e834b5952..66baf8faac51ebd19689a5b22b87f3a454842fac 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
@@ -15,12 +15,15 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+from paddle.fluid.incubate.checkpoint.auto_checkpoint import ExeTrainStatus
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 import os
 import sys
 
 from paddle.fluid.incubate.fleet.utils.fs import LocalFS
 from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 
 
 class FleetTest(unittest.TestCase):
@@ -49,24 +52,35 @@ class FleetTest(unittest.TestCase):
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
 
-        status = TrainStatus(2)
-        fleet.save_checkpoint(exe, dir_path, train_status=status, fs=fs)
-        n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
+        status = ExeTrainStatus()
+        status.epoch_no = 2
+        _, n1 = fleet.save_checkpoint(
+            exe, dir_path, trainer_id=0, train_status=status, fs=fs)
 
-        status2 = fleet.load_checkpoint(exe, dir_path, trainer_id=0, fs=fs)
+        status2 = ExeTrainStatus()
+        fleet.load_checkpoint(
+            exe, dir_path, trainer_id=0, fs=fs, train_status=status2)
         self.assertEqual(status2, status)
 
-        fleet.save_checkpoint(exe, dir_path, train_status=status, fs=fs)
-        n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
+        _, n2 = fleet.save_checkpoint(
+            exe,
+            dir_path,
+            trainer_id=0,
+            train_status=status,
+            fs=fs,
+            remain_all_checkpoint=False)
         self.assertEqual(n2, n1 + 1)
 
-        fleet.clean_redundant_checkpoints(dir_path, fs=fs)
+        c = CheckpointSaver(fs)
+        cp_nos = c.get_checkpoint_no(dir_path)
+        assert len(cp_nos) == 1  # cleanup all others
 
         # unnormal
         # test remain_all_checkpoint 
         fleet.save_checkpoint(
             exe,
             dir_path,
+            trainer_id=0,
             train_status=status,
             fs=fs,
             remain_all_checkpoint=False)
@@ -79,6 +93,7 @@ class FleetTest(unittest.TestCase):
             fleet.save_checkpoint(
                 exe,
                 dir_path,
+                trainer_id=0,
                 train_status=status,
                 fs=fs,
                 cache_path=cache_path)
@@ -88,8 +103,13 @@ class FleetTest(unittest.TestCase):
 
         # can't load under a file
         try:
-            status2 = fleet.load_checkpoint(
-                exe, dir_path, trainer_id=0, fs=fs, cache_path=cache_path)
+            fleet.load_checkpoint(
+                exe,
+                dir_path,
+                trainer_id=0,
+                train_status=status2,
+                fs=fs,
+                cache_path=cache_path)
             self.assertFalse(True)
         except:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
old mode 100644
new mode 100755
index 0590650bd02f5535b9c35bae187e77bc7274901c..55d4ff7726aace09e486156d26efdecf22b310a5
--- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
@@ -14,9 +14,10 @@
 
 import unittest
 import paddle
+from paddle import fluid
 import os
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetDGCOptimizer(unittest.TestCase):
@@ -25,32 +26,42 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ[
             "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
-    def net(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                fleet.init(role)
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
-        strategy.dgc = True
-        strategy.dgc_configs = {
-            "rampup_begin_step": 128,
-            "rampup_step": 100,
-            "sparsity": [0.996, 0.999]
-        }
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+                strategy.dgc = True
+                strategy.dgc_configs = {
+                    "rampup_begin_step": 128,
+                    "rampup_step": 100,
+                    "sparsity": [0.996, 0.999]
+                }
         return avg_cost, strategy
 
     def test_dgc_optimizer(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -59,8 +70,10 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         self.assertIn('dgc_momentum', ops)
 
     def test_dgc_not_apply_with_adam(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Adam(learning_rate=0.01)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -72,8 +85,11 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
 
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index 4994e4514d784f16006d25b4d714bfffc80af2de..8d715674cc6c9ba4f8b5c1ff4fe0cbdbe7841643 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -19,7 +19,7 @@ import os
 
 class TestStrategyConfig(unittest.TestCase):
     def test_amp(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.amp = True
         self.assertEqual(strategy.amp, True)
         strategy.amp = False
@@ -28,7 +28,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.amp, False)
 
     def test_amp_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {
             "init_loss_scaling": 32768,
             "decr_every_n_nan_or_inf": 2,
@@ -41,7 +41,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.amp_configs["init_loss_scaling"], 32768)
 
     def test_recompute(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.recompute = True
         self.assertEqual(strategy.recompute, True)
         strategy.recompute = False
@@ -50,13 +50,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.recompute, False)
 
     def test_recompute_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"checkpoints": ["x", "y"]}
         strategy.recompute_configs = configs
         self.assertEqual(len(strategy.recompute_configs["checkpoints"]), 2)
 
     def test_pipeline(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.pipeline = True
         self.assertEqual(strategy.pipeline, True)
         strategy.pipeline = False
@@ -65,13 +65,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.pipeline, False)
 
     def test_pipeline_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"micro_batch": 4}
         strategy.pipeline_configs = configs
         self.assertEqual(strategy.pipeline_configs["micro_batch"], 4)
 
     def test_localsgd(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.localsgd = True
         self.assertEqual(strategy.localsgd, True)
         strategy.localsgd = False
@@ -80,13 +80,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.localsgd, False)
 
     def test_localsgd_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"k_steps": 4}
         strategy.localsgd_configs = configs
         self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
 
     def test_dgc(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.dgc = True
         self.assertEqual(strategy.dgc, True)
         strategy.dgc = False
@@ -95,7 +95,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.dgc, False)
 
     def test_sync_nccl_allreduce(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.sync_nccl_allreduce = True
         self.assertEqual(strategy.sync_nccl_allreduce, True)
         strategy.sync_nccl_allreduce = False
@@ -104,14 +104,14 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.sync_nccl_allreduce, False)
 
     def test_nccl_comm_num(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.nccl_comm_num = 1
         self.assertEqual(strategy.nccl_comm_num, 1)
         strategy.nccl_comm_num = "2"
         self.assertEqual(strategy.nccl_comm_num, 1)
 
     def test_use_hierarchical_allreduce(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.use_hierarchical_allreduce = True
         self.assertEqual(strategy.use_hierarchical_allreduce, True)
         strategy.use_hierarchical_allreduce = False
@@ -120,14 +120,14 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.use_hierarchical_allreduce, False)
 
     def test_hierarchical_allreduce_inter_nranks(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.hierarchical_allreduce_inter_nranks = 8
         self.assertEqual(strategy.hierarchical_allreduce_inter_nranks, 8)
         strategy.hierarchical_allreduce_inter_nranks = "4"
         self.assertEqual(strategy.hierarchical_allreduce_inter_nranks, 8)
 
     def test_sync_batch_norm(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.sync_batch_norm = True
         self.assertEqual(strategy.sync_batch_norm, True)
         strategy.sync_batch_norm = False
@@ -136,7 +136,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.sync_batch_norm, False)
 
     def test_fuse_all_reduce_ops(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.fuse_all_reduce_ops = True
         self.assertEqual(strategy.fuse_all_reduce_ops, True)
         strategy.fuse_all_reduce_ops = False
@@ -145,21 +145,21 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.fuse_all_reduce_ops, False)
 
     def test_fuse_grad_size_in_MB(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.fuse_grad_size_in_MB = 50
         self.assertEqual(strategy.fuse_grad_size_in_MB, 50)
         strategy.fuse_grad_size_in_MB = "40"
         self.assertEqual(strategy.fuse_grad_size_in_MB, 50)
 
     def test_fuse_grad_size_in_TFLOPS(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy._fuse_grad_size_in_TFLOPS = 0.1
         self.assertGreater(strategy._fuse_grad_size_in_TFLOPS, 0.09)
         strategy._fuse_grad_size_in_TFLOPS = "0.3"
         self.assertGreater(strategy._fuse_grad_size_in_TFLOPS, 0.09)
 
     def test_gradient_merge(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.gradient_merge = True
         self.assertEqual(strategy.gradient_merge, True)
         strategy.gradient_merge = False
@@ -168,13 +168,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.gradient_merge, False)
 
     def test_gradient_merge_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"k_steps": 4}
         strategy.gradient_merge_configs = configs
         self.assertEqual(strategy.gradient_merge_configs["k_steps"], 4)
 
     def test_lars(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.lars = True
         self.assertEqual(strategy.lars, True)
         strategy.lars = False
@@ -183,7 +183,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.lars, False)
 
     def test_lamb(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.lamb = True
         self.assertEqual(strategy.lamb, True)
         strategy.lamb = False
@@ -192,22 +192,23 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.lamb, False)
 
     def test_a_sync(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
         self.assertEqual(strategy.a_sync, True)
         strategy.a_sync = False
         self.assertEqual(strategy.a_sync, False)
-        strategy.a_sync = "True"
-        self.assertEqual(strategy.a_sync, False)
+
+        with self.assertRaises(ValueError):
+            strategy.a_sync = "True"
 
     def test_a_sync_configs(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {"k_steps": 1000}
         strategy.a_sync_configs = configs
         self.assertEqual(strategy.a_sync_configs["k_steps"], 1000)
 
     def test_elastic(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.elastic = True
         self.assertEqual(strategy.elastic, True)
         strategy.elastic = False
@@ -216,7 +217,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.elastic, False)
 
     def test_auto(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.auto = True
         self.assertEqual(strategy.auto, True)
         strategy.auto = False
@@ -225,7 +226,7 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.auto, False)
 
     def test_strategy_prototxt(self):
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
         strategy.localsgd = True
         strategy.dgc = True
@@ -254,7 +255,7 @@ class TestStrategyConfig(unittest.TestCase):
         exe_strategy.num_iteration_per_run = 10
         strategy.execution_strategy = exe_strategy
         strategy.save_to_prototxt("dist_strategy.prototxt")
-        strategy2 = paddle.fleet.DistributedStrategy()
+        strategy2 = paddle.distributed.fleet.DistributedStrategy()
         strategy2.load_from_prototxt("dist_strategy.prototxt")
         self.assertEqual(strategy.dgc, strategy2.dgc)
 
@@ -276,7 +277,7 @@ class TestStrategyConfig(unittest.TestCase):
         build_strategy.enable_backward_optimizer_op_deps = True
         build_strategy.trainers_endpoints = ["1", "2"]
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.build_strategy = build_strategy
 
     def test_execution_strategy(self):
@@ -285,9 +286,36 @@ class TestStrategyConfig(unittest.TestCase):
         exe_strategy.num_iteration_per_drop_scope = 10
         exe_strategy.num_iteration_per_run = 10
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.execution_strategy = exe_strategy
 
+    def test_unknown_strategy(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        with self.assertRaises(TypeError):
+            strategy.unknown_key = 'UNK'
+
+    def test_cudnn_exhaustive_search(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.cudnn_exhaustive_search = False
+        self.assertEqual(strategy.cudnn_exhaustive_search, False)
+        strategy.cudnn_exhaustive_search = "True"
+        self.assertEqual(strategy.cudnn_exhaustive_search, False)
+
+    def test_cudnn_batchnorm_spatial_persistent(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.cudnn_batchnorm_spatial_persistent = False
+        self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
+        strategy.cudnn_batchnorm_spatial_persistent = "True"
+        self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
+
+    def test_conv_workspace_size_limit(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.conv_workspace_size_limit = 1000
+        self.assertEqual(strategy.conv_workspace_size_limit, 1000)
+        strategy.conv_workspace_size_limit = "400"
+        self.assertEqual(strategy.conv_workspace_size_limit, 1000)
+        strategy._enable_env()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
index 36d5912cb7eff23dfde9ef3f12fbc2b782b2ccd3..af72df5186876a8bcbaa5bfa6d71a27fdf46b119 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -15,8 +15,8 @@
 import unittest
 import paddle
 import os
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
@@ -41,10 +41,10 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.gradient_merge = True
         strategy.gradient_merge_configs = {"k_steps": 2, "avg": True}
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
index 7998b1fa5d12e4ca3b7da0f71ed295957f86a279..2ac1dfc0303defaadd48d5d7457643405a4f09a0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
@@ -15,7 +15,7 @@
 import unittest
 import paddle
 import os
-from launch_function_helper import launch_func
+from launch_function_helper import launch_func, _find_free_port
 
 
 class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
@@ -39,10 +39,8 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         }
 
         def node_func():
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-            fleet.init(role)
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
             input_x = paddle.fluid.layers.data(
                 name="x", shape=[32], dtype='float32')
             input_y = paddle.fluid.layers.data(
@@ -57,7 +55,129 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+            optimizer = fleet.distributed_optimizer(
+                optimizer, strategy=strategy)
+            optimizer.minimize(avg_cost)
+
+        proc_a = launch_func(node_func, node_a)
+        proc_a.start()
+        proc_b = launch_func(node_func, node_b)
+        proc_b.start()
+        proc_a.join()
+        proc_b.join()
+
+    def test_graph_execution_optimizer(self):
+
+        port_set = set()
+        port_a = _find_free_port(port_set)
+        port_b = _find_free_port(port_set)
+
+        node_a = {
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        node_b = {
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS":
+            "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        def node_func():
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
+            input_x = paddle.fluid.layers.data(
+                name="x", shape=[32], dtype='float32')
+            input_y = paddle.fluid.layers.data(
+                name="y", shape=[1], dtype='int64')
+
+            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+            prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                size=2,
+                                                act='softmax')
+            cost = paddle.fluid.layers.cross_entropy(
+                input=prediction, label=input_y)
+            avg_cost = paddle.fluid.layers.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.nccl_comm_num = 2
+            strategy.sync_nccl_allreduce = True
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+            optimizer = fleet.distributed_optimizer(
+                optimizer, strategy=strategy)
+            optimizer.minimize(avg_cost)
+            exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
+            exe.run(paddle.fluid.default_startup_program())
+
+            import numpy as np
+
+            def gen_data():
+                return {
+                    "x": np.random.random(size=(128, 32)).astype('float32'),
+                    "y": np.random.randint(
+                        2, size=(128, 1)).astype('int64')
+                }
+
+            for i in range(10):
+                cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name])
+                print("cost of step[{}] = {}".format(i, cost_val))
+
+        proc_a = launch_func(node_func, node_a)
+        proc_a.start()
+        proc_b = launch_func(node_func, node_b)
+        proc_b.start()
+        proc_a.join()
+        proc_b.join()
+
+    def test_graph_execution_optimizer_not_apply_v2(self):
+        node_a = {
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        node_b = {
+            "PADDLE_TRAINER_ID": "1",
+            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004",
+            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
+            "http_proxy": "",
+            "https_proxy": ""
+        }
+
+        def node_func():
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
+            input_x = paddle.fluid.layers.data(
+                name="x", shape=[32], dtype='float32')
+            input_y = paddle.fluid.layers.data(
+                name="y", shape=[1], dtype='int64')
+
+            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+            prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                size=2,
+                                                act='softmax')
+            cost = paddle.fluid.layers.cross_entropy(
+                input=prediction, label=input_y)
+            avg_cost = paddle.fluid.layers.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             optimizer = paddle.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(
                 optimizer, strategy=strategy)
@@ -90,10 +210,8 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         }
 
         def node_func():
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-            fleet.init(role)
+            import paddle.distributed.fleet as fleet
+            fleet.init(is_collective=True)
             input_x = paddle.fluid.layers.data(
                 name="x", shape=[32], dtype='float32')
             input_y = paddle.fluid.layers.data(
@@ -108,10 +226,10 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
-            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(
                 optimizer, strategy=strategy)
             optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
index 47e8949922a01855c6d1f1947f0b8b5282da3c48..69f5b134888b0f3268cea112eeefd9fb7fd0127f 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
@@ -14,6 +14,8 @@
 
 import unittest
 import paddle
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 import os
 from launch_function_helper import launch_func
 
@@ -39,8 +41,6 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
         }
 
         def node_func():
-            import paddle.fleet as fleet
-            import paddle.fluid.incubate.fleet.base.role_maker as role_maker
             role = role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
             input_x = paddle.fluid.layers.data(
@@ -57,10 +57,10 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-            strategy = paddle.fleet.DistributedStrategy()
+            strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
-            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(
                 optimizer, strategy=strategy)
             optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
new file mode 100755
index 0000000000000000000000000000000000000000..3f140f53b043b1949572f3728ca8a0c556317783
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+from paddle import fluid
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestFleetLambMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+                       "127.0.0.1:36001,127.0.0.2:36001"
+
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+                strategy.lamb = True
+                strategy.lamb_configs = {
+                    'lamb_weight_decay': 0.01,
+                    'exclude_from_weight_decay': [],
+                }
+
+        return avg_cost, strategy
+
+    def test_lamb_optimizer(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('lamb', ops)
+
+    def test_lamb_not_apply_with_momentum(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.1, momentum=0.9)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertNotIn('lamb', ops)
+
+    def test_lamb_exclude_fn(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        strategy.lamb_configs = {
+            'lamb_weight_decay': 0.01,
+            'exclude_from_weight_decay': ['.b_0'],
+        }
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops_with_bias = [
+            op for op in avg_cost.block.ops
+            if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0')
+        ]
+        for op in ops_with_bias:
+            self.assertEqual(op.attr('weight_decay'), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
index 960ffbd4035f9c1891a205cd8afbd1ca581284bd..3caa1a4eac0bf191b13e6708b1a9adffdb111ca7 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
@@ -14,9 +14,10 @@
 
 import unittest
 import paddle
+from paddle import fluid
 import os
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetLarsMetaOptimizer(unittest.TestCase):
@@ -27,32 +28,42 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
                        "127.0.0.1:36001,127.0.0.2:36001"
 
-    def net(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
-        strategy.lars = True
-        strategy.lars_configs = {
-            "lars_coeff": 0.001,
-            "lars_weight_decay": 0.0005,
-        }
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+                strategy.lars = True
+                strategy.lars_configs = {
+                    "lars_coeff": 0.001,
+                    "lars_weight_decay": 0.0005,
+                }
 
         return avg_cost, strategy
 
     def test_lars_optimizer(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -60,8 +71,12 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         self.assertIn('lars_momentum', ops)
 
     def test_lars_not_apply_with_adam(self):
-        avg_cost, strategy = self.net()
-        optimizer = paddle.optimizer.Adam(learning_rate=0.01)
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index 577f9f6504fd83377f481aeab63b1780d50f6abe..c5edc96963408bf1fad793f7271d75159934f019 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -4,7 +4,22 @@ set -e
 
 function test_launch_ps(){
     fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then
+        echo "test pserver launch succeed"
+    else
+        echo "test pserver launch failed"
+        exit -1
+    fi
+
+    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then
+        echo "test pserver launch succeed"
+    else
+        echo "test pserver launch failed"
+        exit -1
+    fi
 
+    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
     else
@@ -20,7 +35,7 @@ fi
 
 test_launch_ps
 # use default values
-fleetrun multi_process.py
+fleetrun multi_process.py fleetrun
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -30,16 +45,16 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
 
-export PADDLE_PORT=35019
+export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
+file_0="multi_process_fleetrun.check_0.log"
+file_1="multi_process_fleetrun.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -70,7 +85,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
 fi
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
index 1f2ceb298e72edda3da7d4ddd00444208bb21591..07b988bf8752057e68925bc42f564a72d466361d 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -16,8 +16,8 @@ import unittest
 import paddle
 import os
 
-import paddle.fleet as fleet
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
@@ -39,14 +39,14 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.localsgd = True
         strategy.auto = True
         config = strategy.localsgd_configs
         config['k_steps'] = 1
         strategy.localsgd_configs = config
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
new file mode 100755
index 0000000000000000000000000000000000000000..dfea848aadfc44c57c91c11d196eff49d57cab08
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+from paddle import fluid
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase
+
+
+class TestFleetMetaOptimizerBase(unittest.TestCase):
+    def net(main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                fleet.init(role)
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+
+                optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+                opt = MetaOptimizerBase(optimizer)
+                opt_ops, params_grads = opt.minimize(avg_cost)
+                opt.apply_optimize(avg_cost,
+                                   paddle.static.default_startup_program(),
+                                   params_grads)
+        return None
+
+    net(fluid.default_startup_program(), fluid.default_main_program())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_metric.py b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
index 2dacc02797a251b894118e170f5a287a848790fc..6a7963f43824f61960e6523ceb5c618f783e8a7a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_metric.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
@@ -19,7 +19,7 @@ import paddle
 import paddle.fluid as fluid
 import os
 import unittest
-import paddle.fleet.metrics.metric as metric
+import paddle.distributed.fleet.metrics.metric as metric
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
index 7b7e3c7c4173fe34368d6f4207491b3800907f57..b2b6136797ba460f9f829d5df4c7041664b424cb 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
@@ -33,7 +33,8 @@ class TestFleet1(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -47,10 +48,10 @@ class TestFleet1(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
index 0005a4a8dbebff04cd9b11d0af082b01c718ca48..adbb1268c6f4d7b21876dacdbbb3cf453a14d0f4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
@@ -19,11 +19,13 @@ import os
 
 class TestFleetMetaOptimizer(unittest.TestCase):
     def setUp(self):
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
     def test_pipeline_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet as fleet
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         with paddle.fluid.device_guard("cpu"):
@@ -47,11 +49,11 @@ class TestFleetMetaOptimizer(unittest.TestCase):
                 input=prediction, label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.pipeline = True
         strategy.pipeline_configs = {'micro_batch': 2}
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
index ec99acf109816570db48d9f15bbbdd897133006a..beec6d7f51c4f54ba5cbf2af255afd6082cedd52 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
@@ -36,7 +36,7 @@ class TestFleetPrivateFunction(unittest.TestCase):
         thr = threading.Thread(target=init_server, args=(9292, ))
         thr.start()
 
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
         ep = ["127.0.0.1:9292"]
         fleet.base.private_helper_function.wait_server_ready(ep)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
index f62c8d32d6cfa3bbcc20e9b5f862387f05d475fb..a42010a4eaa5066821adb817e7a5df2b81bedf7c 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
@@ -26,8 +26,8 @@ class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
                        "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_recompute_optimizer(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet as fleet
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         input_x = paddle.fluid.layers.data(
@@ -41,11 +41,11 @@ class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
             input=prediction, label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
-        strategy = paddle.fleet.DistributedStrategy()
+        strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.recompute = True
-        strategy.recompute_configs = {"checkpoints": ["fc2"]}
+        strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}
 
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
index 3abad755ac1755ddd62859fae45a14e6aaf528ee..7f1ad5d52d8f0b5a6b5bd83ea3a158f123e870ea 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py
@@ -61,7 +61,8 @@ class TestCloudRoleMaker(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -75,10 +76,11 @@ class TestCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #print("init rolemaker")
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
index 88a9d235855ce813ad0abc0f304eb0e8adc35ab9..eb5d9eb66608dd397dad773158c337fc67be2dbb 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
@@ -14,6 +14,7 @@
 """Test cases for role makers."""
 
 from __future__ import print_function
+import paddle
 import os
 import unittest
 
@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
             data = "1 1 1 1\n"
             f.write(data)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
         dataset.set_use_var([show, label])
         dataset.load_into_memory()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
index 39d3d2a2a042c74f2af0e92dd740a28ef60a5d5d..0fa852eeeebe9c8fbb056fca388a0af2c8f92842 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
@@ -33,7 +33,8 @@ class TestCloudRoleMaker(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -50,10 +51,10 @@ class TestCloudRoleMaker(unittest.TestCase):
             init_timeout_seconds=100,
             run_timeout_seconds=100,
             http_ip_port="127.0.0.1:36003")
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
index dd5cd715ecd1ed9ebc30a22cb924255d278643ed..6414ef18d635aea4b73c17a4931c37e596ed6029 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
@@ -40,10 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
             from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
             from paddle.fluid.incubate.fleet.base.role_maker import \
                 GeneralRoleMaker
-            from paddle.fluid.incubate.fleet.utils.http_server import KVHandler
-            from paddle.fluid.incubate.fleet.utils.http_server import KVServer
-            from paddle.fluid.incubate.fleet.utils.http_server import \
-                KVHTTPServer
+            from paddle.distributed.fleet.utils import KVHandler
+            from paddle.distributed.fleet.utils import KVServer
+            from paddle.distributed.fleet.utils import KVHTTPServer
         except:
             print("warning: no fleet, skip test_pslib_4")
             return
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf9b3e1e9a1605a714b47d99183511b24c903722
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -0,0 +1,174 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import unittest
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestRoleMakerBase(unittest.TestCase):
+    """
+    Test cases for RoleMakerBase
+    """
+
+    def test_rolemaker_base(self):
+        role = role_maker.RoleMakerBase()
+        self.assertRaises(Exception, role.is_worker)
+        self.assertRaises(Exception, role.is_server)
+        self.assertRaises(Exception, role.is_first_worker)
+        self.assertRaises(Exception, role.worker_num)
+        self.assertRaises(Exception, role.server_num)
+        self.assertRaises(Exception, role.worker_index)
+        self.assertRaises(Exception, role.server_index)
+        self.assertRaises(Exception, role.role_id)
+        self.assertRaises(Exception, role.node_num)
+
+        trainer_endpoints = role.get_trainer_endpoints()
+        self.assertTrue(len(trainer_endpoints) == 0)
+        pserver_endpoints = role.get_pserver_endpoints()
+        self.assertTrue(len(pserver_endpoints) == 0)
+
+        print(role.to_string())
+        self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
+        self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
+        role._barrier(role._node_type_comm)
+
+
+class TestCloudRoleMaker(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker.
+    """
+
+    def setUp(self):
+        """Set up, set envs."""
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+    def test_tr_rolemaker(self):
+        """Test tr rolenamer."""
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_tr_rolemaker")
+            return
+
+        ro = role_maker.PaddleCloudRoleMaker(
+            is_collective=False, init_gloo=False)
+        self.assertTrue(ro.is_worker())
+        self.assertFalse(ro.is_server())
+        self.assertEqual(ro.worker_num(), 2)
+        self.assertTrue(ro.is_first_worker())
+        worker_endpoints = ro.get_trainer_endpoints()
+        self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
+        self.assertEqual(ro.role_id(), 0)
+        self.assertEqual(ro.node_num(), 2)
+
+    def test_tr_rolemaker_collective(self):
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        self.assertEqual(ro.worker_num(), 2)
+        self.assertEqual(ro.node_num(), 2)
+
+    def test_ps_rolemaker(self):
+        """Test ps rolemaker."""
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_ps_rolemaker")
+            return
+
+        ro = role_maker.PaddleCloudRoleMaker(
+            is_collective=False, init_gloo=False)
+        self.assertEqual(ro.server_index(), 0)
+        self.assertFalse(ro.is_worker())
+        self.assertTrue(ro.is_server())
+        self.assertEqual(ro.server_num(), 2)
+        pserver_endpoints = ro.get_pserver_endpoints()
+        self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
+        self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
+        self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
+
+    def test_traing_role(self):
+        """Test training role."""
+        os.environ["TRAINING_ROLE"] = "TEST"
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_training_role")
+            return
+
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro.generate_role)
+
+
+class TestUserDefinedRoleMaker(unittest.TestCase):
+    """
+    Test cases for UserDefinedRoleMaker.
+    """
+
+    def setUp(self):
+        pass
+
+    def test_ps_rolemaker(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_ps_rolemaker")
+            return
+
+        ro = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            role=role_maker.Role.SERVER,
+            current_id=0,
+            worker_num=2)
+        self.assertEqual(ro.server_num(), 2)
+        ro.generate_role()
+        self.assertTrue(ro.is_server())
+        self.assertEqual(ro.role_id(), 0)
+
+    def test_tr_rolemaker(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_tr_rolemaker")
+            return
+
+        ro = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            role=role_maker.Role.WORKER,
+            current_id=0,
+            worker_num=2)
+        self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
+        self.assertTrue(ro.is_worker())
+        self.assertEqual(ro.role_id(), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
index 474e5da1c219c4b6e5a35a59ee235fdcbdb34cce..80109716a54e52dc6050b724046561f37020a645 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
@@ -19,21 +19,45 @@ import os
 
 class TestFleetRuntime(unittest.TestCase):
     def test_fleet_runtime_base(self):
-        import paddle.fleet.runtime
-        base = paddle.fleet.runtime.runtime_base.RuntimeBase()
+        import paddle.distributed.fleet.runtime
+        base = paddle.distributed.fleet.runtime.runtime_base.RuntimeBase()
         base._run_worker()
         base._init_server()
         base._run_server()
         base._stop_worker()
+        base._save_inference_model()
+        base._save_persistables()
 
     def test_fleet_collective_runtime(self):
-        import paddle.fleet.runtime
-        collective_runtime = paddle.fleet.runtime.CollectiveRuntime()
+        import paddle.distributed.fleet.runtime
+        collective_runtime = paddle.distributed.fleet.runtime.CollectiveRuntime(
+        )
         collective_runtime._init_worker()
         collective_runtime._run_worker()
         collective_runtime._init_worker()
         collective_runtime._run_server()
         collective_runtime._stop_worker()
+        collective_runtime._save_inference_model()
+        collective_runtime._save_persistables()
+
+    def test_fleet_ps_runtime(self):
+        ps_runtime = paddle.distributed.fleet.runtime.ParameterServerRuntime()
+        self.assertRaises(Exception, ps_runtime._get_optimizer_status,
+                          "test_op", None)
+        reshaped_names, origin_names = ps_runtime._get_optimizer_status("adam",
+                                                                        "param")
+        self.assertTrue(
+            len(reshaped_names) == 2 and
+            reshaped_names[0] == 'param_moment1_0' and
+            reshaped_names[1] == 'param_moment2_0')
+        self.assertTrue(
+            len(origin_names) == 2 and
+            origin_names[0] == 'param_beta1_pow_acc_0' and
+            origin_names[1] == 'param_beta2_pow_acc_0')
+
+        reshaped_names, origin_names = ps_runtime._get_optimizer_status("sgd",
+                                                                        "param")
+        self.assertTrue(len(reshaped_names) == 0 and len(origin_names) == 0)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py b/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
index 3b0e8be63d95f29ccd1da145403a7a441698fead..7a255e5da14dacc1a5552642640e3ffe1e4eaad4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_unitaccessor.py
@@ -33,7 +33,8 @@ class TestFleet1(unittest.TestCase):
     def test_pslib_1(self):
         """Test cases for pslib."""
         import paddle.fluid as fluid
-        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
         from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
         try:
             import netifaces
@@ -47,10 +48,10 @@ class TestFleet1(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         role_maker = GeneralRoleMaker()
-        role_maker.generate_role()
+        #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        fleet.init(role_maker)
+        #fleet.init(role_maker)
         train_program = fluid.Program()
         startup_program = fluid.Program()
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index 427e077416e979ad5a77f4744ba6ffdb5064fdff..dde36e073fb20eed3b17c79a886739f59ecb185d 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -12,14 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
+from __future__ import print_function
 import paddle
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+import tarfile
+import tempfile
 import os
+import sys
+from paddle.dataset.common import download, DATA_HOME
+from paddle.distributed.fleet.base.util_factory import fleet_util
+import paddle.distributed.fleet.base.role_maker as role_maker
 
 
 class TestFleetUtil(unittest.TestCase):
+    proto_data_url = "https://fleet.bj.bcebos.com/fleet_util_data.tgz"
+    proto_data_md5 = "59b7f12fd9dc24b64ae8e4629523a92a"
+    module_name = "fleet_util_data"
+    pruned_dir = os.path.join("fleet_util_data", "pruned_model")
+    train_dir = os.path.join("fleet_util_data", "train_program")
+
     def test_util_base(self):
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
         util = fleet.UtilBase()
         strategy = fleet.DistributedStrategy()
         util._set_strategy(strategy)
@@ -27,7 +42,7 @@ class TestFleetUtil(unittest.TestCase):
         util._set_role_maker(role_maker)
 
     def test_util_factory(self):
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
         factory = fleet.base.util_factory.UtilFactory()
         strategy = fleet.DistributedStrategy()
         role_maker = None  # should be fleet.PaddleCloudRoleMaker()
@@ -40,15 +55,15 @@ class TestFleetUtil(unittest.TestCase):
         self.assertEqual(util.role_maker, None)
 
     def test_get_util(self):
-        import paddle.fleet as fleet
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet as fleet
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         default_util = fleet.util
         self.assertEqual(default_util, None)
 
     def test_set_user_defined_util(self):
-        import paddle.fleet as fleet
+        import paddle.distributed.fleet as fleet
 
         class UserDefinedUtil(fleet.UtilBase):
             def __init__(self):
@@ -57,7 +72,7 @@ class TestFleetUtil(unittest.TestCase):
             def get_user_id(self):
                 return 10
 
-        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+        import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
@@ -65,6 +80,262 @@ class TestFleetUtil(unittest.TestCase):
         user_id = fleet.util.get_user_id()
         self.assertEqual(user_id, 10)
 
+    def test_fs(self):
+        from paddle.distributed.fleet.utils import LocalFS
+        fs = LocalFS()
+        dirs, files = fs.ls_dir("test_tmp")
+        dirs, files = fs.ls_dir("./")
+        self.assertFalse(fs.need_upload_download())
+        fleet_util.set_file_system(fs)
+
+    def test_barrier(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_barrier")
+            return
+
+        gloo = fluid.core.Gloo()
+        gloo.set_rank(0)
+        gloo.set_size(1)
+        gloo.set_prefix("123")
+        gloo.set_iface("lo")
+        gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
+        gloo.init()
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.SERVER,
+            worker_endpoints=["127.0.0.1:6003"],
+            server_endpoints=["127.0.0.1:6001"])
+        role._node_type_comm = gloo
+        role._role_is_generated = True
+        fleet_util._set_role_maker(role)
+
+        fleet_util.barrier("worker")
+
+    def test_all_reduce(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_all_reduce")
+            return
+
+        gloo = fluid.core.Gloo()
+        gloo.set_rank(0)
+        gloo.set_size(1)
+        gloo.set_prefix("123")
+        gloo.set_iface("lo")
+        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
+        gloo.init()
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.WORKER,
+            worker_endpoints=["127.0.0.1:6003"],
+            server_endpoints=["127.0.0.1:6001"])
+        role._node_type_comm = gloo
+        role._role_is_generated = True
+        fleet_util._set_role_maker(role)
+
+        output = fleet_util.all_reduce(1, "sum", comm_world="server")
+        print(output)
+
+    # self.assertEqual(output, 1)
+
+    def test_all_gather(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_all_gather")
+            return
+
+        gloo = fluid.core.Gloo()
+        gloo.set_rank(0)
+        gloo.set_size(1)
+        gloo.set_prefix("123")
+        gloo.set_iface("lo")
+        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
+        gloo.init()
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.SERVER,
+            worker_endpoints=["127.0.0.1:6003"],
+            server_endpoints=["127.0.0.1:6001"])
+        role._node_type_comm = gloo
+        role._all_comm = gloo
+        role._role_is_generated = True
+        fleet_util._set_role_maker(role)
+
+        output = fleet_util.all_gather(1, comm_world="all")
+        print(output)
+        # self.assertTrue(len(output) == 1 and output[0] == 1)
+        self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
+
+    def download_files(self):
+        path = download(self.proto_data_url, self.module_name,
+                        self.proto_data_md5)
+        print('data is downloaded at ' + path)
+        tar = tarfile.open(path)
+        unzip_folder = tempfile.mkdtemp()
+        tar.extractall(unzip_folder)
+        return unzip_folder
+
+    def test_get_file_shard(self):
+        self.assertRaises(Exception, fleet_util.get_file_shard, "files")
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_get_file_shard")
+            return
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.WORKER,
+            worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+        fleet_util._set_role_maker(role)
+        files = fleet_util.get_file_shard(["1", "2", "3"])
+        self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
+
+    def test_program_type_trans(self):
+        data_dir = self.download_files()
+        program_dir = os.path.join(data_dir, self.pruned_dir)
+        text_program = "pruned_main_program.pbtxt"
+        binary_program = "pruned_main_program.bin"
+        text_to_binary = fleet_util._program_type_trans(program_dir,
+                                                        text_program, True)
+        binary_to_text = fleet_util._program_type_trans(program_dir,
+                                                        binary_program, False)
+        self.assertTrue(
+            os.path.exists(os.path.join(program_dir, text_to_binary)))
+        self.assertTrue(
+            os.path.exists(os.path.join(program_dir, binary_to_text)))
+
+    def test_prams_check(self):
+        data_dir = self.download_files()
+
+        class config:
+            pass
+
+        feed_config = config()
+        feed_config.feeded_vars_names = ['concat_1.tmp_0', 'concat_2.tmp_0']
+        feed_config.feeded_vars_dims = [682, 1199]
+        feed_config.feeded_vars_types = [np.float32, np.float32]
+        feed_config.feeded_vars_filelist = [
+            os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_1")),
+            os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_2"))
+        ]
+
+        fetch_config = config()
+        fetch_config.fetch_vars_names = ['similarity_norm.tmp_0']
+
+        conf = config()
+        conf.batch_size = 1
+        conf.feed_config = feed_config
+        conf.fetch_config = fetch_config
+        conf.dump_model_dir = os.path.join(data_dir, self.pruned_dir)
+        conf.dump_program_filename = "pruned_main_program.pbtxt"
+        conf.is_text_dump_program = True
+        conf.save_params_filename = None
+
+        # test saved var's shape
+        conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
+
+        self.assertRaises(Exception, fleet_util._params_check)
+
+        # test program.proto without feed_op and fetch_op
+        conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
+        results = fleet_util._params_check(conf)
+        self.assertTrue(len(results) == 1)
+        np.testing.assert_array_almost_equal(
+            results[0], np.array(
+                [[3.0590223e-07]], dtype=np.float32))
+
+        # test feed_var's shape
+        conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
+        self.assertRaises(Exception, fleet_util._params_check)
+
+        # test correct case with feed_vars_filelist
+        conf.dump_program_filename = "pruned_main_program.pbtxt"
+        results = fleet_util._params_check(conf)
+        self.assertTrue(len(results) == 1)
+        np.testing.assert_array_almost_equal(
+            results[0], np.array(
+                [[3.0590223e-07]], dtype=np.float32))
+
+        # test correct case without feed_vars_filelist
+        conf.feed_config.feeded_vars_filelist = None
+        # test feed var with lod_level >= 2
+        conf.dump_program_filename = "pruned_main_program.feed_lod2"
+        self.assertRaises(Exception, fleet_util._params_check)
+
+        conf.dump_program_filename = "pruned_main_program.pbtxt"
+        results = fleet_util._params_check(conf)
+        self.assertTrue(len(results) == 1)
+
+    def test_proto_check(self):
+        data_dir = self.download_files()
+
+        class config:
+            pass
+
+        conf = config()
+        conf.train_prog_path = os.path.join(
+            data_dir, os.path.join(self.train_dir, "join_main_program.pbtxt"))
+        conf.is_text_train_program = True
+
+        # test not match
+        conf.pruned_prog_path = os.path.join(
+            data_dir,
+            os.path.join(self.pruned_dir,
+                         "pruned_main_program.save_var_shape_not_match"))
+        conf.is_text_pruned_program = True
+        conf.draw = False
+        res = fleet_util._proto_check(conf)
+        self.assertFalse(res)
+
+        # test match
+        conf.pruned_prog_path = os.path.join(
+            data_dir,
+            os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
+        if sys.platform == 'win32' or sys.platform == 'sys.platform':
+            conf.draw = False
+        else:
+            conf.draw = True
+            conf.draw_out_name = "pruned_check"
+        res = fleet_util._proto_check(conf)
+        self.assertTrue(res)
+
+    def test_visualize(self):
+        if sys.platform == 'win32' or sys.platform == 'sys.platform':
+            pass
+        else:
+            data_dir = self.download_files()
+            program_path = os.path.join(
+                data_dir,
+                os.path.join(self.train_dir, "join_main_program.pbtxt"))
+            is_text = True
+            program = fleet_util._load_program(program_path, is_text)
+            output_dir = os.path.join(data_dir, self.train_dir)
+            output_filename = "draw_prog"
+            fleet_util._visualize_graphviz(program, output_dir, output_filename)
+            self.assertTrue(
+                os.path.exists(
+                    os.path.join(output_dir, output_filename + ".dot")))
+            self.assertTrue(
+                os.path.exists(
+                    os.path.join(output_dir, output_filename + ".pdf")))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
index 0d87b94538f05d734cb3e621fc0dfc7c48e8fea2..c01876531c99c610706265ff646d93c4a197a26e 100644
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -15,14 +15,12 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 import inspect
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
-from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 
 class FSTest(unittest.TestCase):
@@ -40,6 +38,8 @@ class FSTest(unittest.TestCase):
                 func(a)
             elif len(args) == 3:
                 func(a, a)
+            elif len(args) == 5:
+                func(a, a, a, a)
             print("args:", args, len(args), "func:", func)
             self.assertFalse(True)
         except NotImplementedError as e:
diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py
index 21cbab193419be9413c487c8631671097016d959..ba14aeae990329915e080969ca74b8a9658632e9 100644
--- a/python/paddle/fluid/tests/unittests/test_full_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import paddle
 import paddle.fluid.core as core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 import paddle.compat as cpt
 import unittest
 import numpy as np
@@ -38,7 +38,7 @@ class TestFullOp(unittest.TestCase):
             place = paddle.CPUPlace()
             if core.is_compiled_with_cuda():
                 place = paddle.CUDAPlace(0)
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             exe.run(startup_program)
 
             img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
@@ -53,12 +53,13 @@ class TestFullOp(unittest.TestCase):
                 msg="full_like output is wrong, out = " + str(out_np))
 
     def test_full_like_imperative(self):
-        with paddle.imperative.guard():
-            input = paddle.arange(6, 10, dtype='float32')
-            out = paddle.full_like(input, fill_value=888.88, dtype='float32')
-            out_numpy = np.random.random((4)).astype("float32")
-            out_numpy.fill(888.88)
-            self.assertTrue((out.numpy() == out_numpy).all(), True)
+        paddle.disable_static()
+        input = paddle.arange(6, 10, dtype='float32')
+        out = paddle.full_like(input, fill_value=888.88, dtype='float32')
+        out_numpy = np.random.random((4)).astype("float32")
+        out_numpy.fill(888.88)
+        self.assertTrue((out.numpy() == out_numpy).all(), True)
+        paddle.enable_static()
 
 
 class TestFullOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
index c43454eaaee9e3b2f9aa371453e58b009c99a52c..68be0bf5d561ef0d8fe92005dd9ddb47c21aca51 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
@@ -37,7 +37,6 @@ class TestFunctionalConv2D(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def prepare(self):
@@ -88,7 +87,6 @@ class TestFunctionalConv2D(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
                     act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
@@ -121,9 +119,11 @@ class TestFunctionalConv2D(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
+
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -144,10 +144,12 @@ class TestFunctionalConv2D(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
+
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
+
             out = y.numpy()
         return out
 
@@ -185,7 +187,6 @@ class TestFunctionalConv2DError(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def test_exception(self):
@@ -228,9 +229,7 @@ class TestFunctionalConv2DError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
 
 
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
@@ -383,21 +382,6 @@ class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
         self.data_format = "NCHW"
 
 
-class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
-        self.data_format = "NCHW"
-
-
 class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
     def setUp(self):
         self.in_channels = 3
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
index 21986f1b98d869289ddb34a65316aca57c83f9d9..1fb07bf4345909deb5485a89232270336658ae8b 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
@@ -37,8 +37,6 @@ class TestFunctionalConv2D(TestCase):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def prepare(self):
@@ -90,8 +88,6 @@ class TestFunctionalConv2D(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
-                    act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
         exe.run(start)
@@ -115,7 +111,7 @@ class TestFunctionalConv2D(TestCase):
                     "weight", self.weight.shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv2d_transpose(
+                y = F.conv_transpose2d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -124,9 +120,7 @@ class TestFunctionalConv2D(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -140,7 +134,7 @@ class TestFunctionalConv2D(TestCase):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv2d_transpose(
+            y = F.conv_transpose2d(
                 x,
                 weight,
                 bias,
@@ -148,10 +142,8 @@ class TestFunctionalConv2D(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
             out = y.numpy()
         return out
 
@@ -189,8 +181,6 @@ class TestFunctionalConv2DError(TestCase):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
     def test_exception(self):
@@ -225,7 +215,7 @@ class TestFunctionalConv2DError(TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv2d_transpose(
+                y = F.conv_transpose2d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -234,9 +224,7 @@ class TestFunctionalConv2DError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
 
 
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
@@ -249,8 +237,6 @@ class TestFunctionalConv2DCase2(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -264,8 +250,6 @@ class TestFunctionalConv2DCase3(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 1
         self.no_bias = True
-        self.act = None
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -279,8 +263,6 @@ class TestFunctionalConv2DCase4(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -294,8 +276,6 @@ class TestFunctionalConv2DCase5(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -309,8 +289,6 @@ class TestFunctionalConv2DCase6(TestFunctionalConv2D):
         self.dilation = (2, 1)
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -324,8 +302,6 @@ class TestFunctionalConv2DCase7(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 4
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NHWC"
 
 
@@ -340,8 +316,6 @@ class TestFunctionalConv2DCase8(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -355,8 +329,6 @@ class TestFunctionalConv2DCase9(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -370,8 +342,6 @@ class TestFunctionalConv2DCase10(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -385,8 +355,6 @@ class TestFunctionalConv2DCase11(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -400,8 +368,6 @@ class TestFunctionalConv2DCase12(TestFunctionalConv2D):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -415,8 +381,6 @@ class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -430,8 +394,6 @@ class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NHWC"
 
 
@@ -445,8 +407,6 @@ class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -460,23 +420,6 @@ class TestFunctionalConv2DErrorCase5(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
-        self.data_format = "NCHW"
-
-
-class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
         self.data_format = "NCHW"
 
 
@@ -491,8 +434,6 @@ class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
@@ -506,8 +447,6 @@ class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 1
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "not_valid"
 
 
@@ -521,8 +460,6 @@ class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
         self.dilation = 1
         self.groups = 2
         self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCHW"
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
index 195e3812f94843f6ccdd05cbc317238765e4c06b..b413a56c07a9ce3afbe15baffbffaf92a3d42129 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
@@ -37,7 +37,6 @@ class TestFunctionalConv3D(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def prepare(self):
@@ -88,7 +87,6 @@ class TestFunctionalConv3D(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
                     act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
@@ -121,9 +119,11 @@ class TestFunctionalConv3D(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
+
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -144,10 +144,12 @@ class TestFunctionalConv3D(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
+
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
+
             out = y.numpy()
         return out
 
@@ -185,7 +187,6 @@ class TestFunctionalConv3DError(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def test_exception(self):
@@ -228,9 +229,10 @@ class TestFunctionalConv3DError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
 
 
 class TestFunctionalConv3DCase2(TestFunctionalConv3D):
@@ -244,7 +246,6 @@ class TestFunctionalConv3DCase2(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -259,7 +260,6 @@ class TestFunctionalConv3DCase3(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -274,7 +274,6 @@ class TestFunctionalConv3DCase4(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -289,7 +288,6 @@ class TestFunctionalConv3DCase5(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -304,7 +302,6 @@ class TestFunctionalConv3DCase6(TestFunctionalConv3D):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -319,7 +316,6 @@ class TestFunctionalConv3DCase7(TestFunctionalConv3D):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -349,7 +345,6 @@ class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NCDHW"
 
 
@@ -364,7 +359,6 @@ class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "not_valid"
 
 
@@ -379,22 +373,6 @@ class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DErrorCase6(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
         self.data_format = "NCDHW"
 
 
@@ -409,7 +387,6 @@ class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "not_valid"
 
 
@@ -424,7 +401,6 @@ class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -439,7 +415,6 @@ class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NCDHW"
 
 
@@ -454,7 +429,6 @@ class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NDHWC"
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
index f8e7818315fa077df4d8ad0d6d3f76b47501b5e9..7441f7cb915e8b1fdd2155fff79e145fb6a00c0f 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
@@ -38,7 +38,6 @@ class TestFunctionalConv3DTranspose(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def prepare(self):
@@ -90,7 +89,6 @@ class TestFunctionalConv3DTranspose(TestCase):
                     param_attr=I.NumpyArrayInitializer(self.weight),
                     bias_attr=False
                     if self.no_bias else I.NumpyArrayInitializer(self.bias),
-                    use_cudnn=self.use_cudnn,
                     act=self.act,
                     data_format=self.data_format)
         exe = fluid.Executor(self.place)
@@ -115,7 +113,7 @@ class TestFunctionalConv3DTranspose(TestCase):
                     "weight", self.weight.shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv3d_transpose(
+                y = F.conv_transpose3d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -124,9 +122,9 @@ class TestFunctionalConv3DTranspose(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -140,7 +138,7 @@ class TestFunctionalConv3DTranspose(TestCase):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv3d_transpose(
+            y = F.conv_transpose3d(
                 x,
                 weight,
                 bias,
@@ -148,10 +146,10 @@ class TestFunctionalConv3DTranspose(TestCase):
                 padding=self.padding,
                 stride=self.stride,
                 dilation=self.dilation,
-                act=self.act,
                 groups=self.groups,
-                data_format=self.data_format,
-                use_cudnn=self.use_cudnn)
+                data_format=self.data_format)
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
             out = y.numpy()
         return out
 
@@ -190,7 +188,6 @@ class TestFunctionalConv3DTransposeError(TestCase):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
     def test_exception(self):
@@ -225,7 +222,7 @@ class TestFunctionalConv3DTransposeError(TestCase):
                     "weight", self.weight_shape, dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv3d_transpose(
+                y = F.conv_transpose3d(
                     x,
                     weight,
                     None if self.no_bias else bias,
@@ -234,9 +231,9 @@ class TestFunctionalConv3DTransposeError(TestCase):
                     stride=self.stride,
                     dilation=self.dilation,
                     groups=self.groups,
-                    act=self.act,
-                    data_format=self.data_format,
-                    use_cudnn=self.use_cudnn)
+                    data_format=self.data_format)
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
 
 
 class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
@@ -250,7 +247,6 @@ class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -265,7 +261,6 @@ class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -280,7 +275,6 @@ class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = True
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -295,7 +289,6 @@ class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -310,7 +303,6 @@ class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose):
         self.groups = 4
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = False
         self.data_format = "NDHWC"
 
 
@@ -326,7 +318,6 @@ class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose):
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -341,7 +332,6 @@ class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -356,7 +346,6 @@ class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -371,7 +360,6 @@ class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -386,7 +374,6 @@ class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose):
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -402,7 +389,6 @@ class TestFunctionalConv3DTransposeErrorCase2(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -418,7 +404,6 @@ class TestFunctionalConv3DTransposeErrorCase3(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NDHWC"
 
 
@@ -434,7 +419,6 @@ class TestFunctionalConv3DTransposeErrorCase4(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -450,23 +434,6 @@ class TestFunctionalConv3DTransposeErrorCase5(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DTransposeErrorCase6(
-        TestFunctionalConv3DTransposeError):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = "not_valid"
         self.data_format = "NCDHW"
 
 
@@ -483,7 +450,6 @@ class TestFunctionalConv3DTransposeErrorCase7(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
@@ -499,7 +465,6 @@ class TestFunctionalConv3DTransposeErrorCase8(
         self.groups = 1
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "not_valid"
 
 
@@ -515,7 +480,6 @@ class TestFunctionalConv3DTransposeErrorCase9(
         self.groups = 2
         self.no_bias = False
         self.act = "sigmoid"
-        self.use_cudnn = True
         self.data_format = "NCDHW"
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
index fb7454542587323a8775b066646bb1cd1c79c9ec..d8a5816a42a2fd03ecfaa11f22b602f89a422cda 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -30,6 +30,7 @@ def fusion_gru(
         wh,  # D x 3D
         bias,  # 1 x 3D
         is_reverse,
+        origin_mode,
         act_state,
         act_gate):
     return gru(fc(x, wx, bias),
@@ -40,7 +41,8 @@ def fusion_gru(
                    (1, wh.shape[1]), dtype='float32'),
                is_reverse,
                act_state,
-               act_gate)
+               act_gate,
+               origin_mode=origin_mode)
 
 
 class TestFusionGRUOp(OpTest):
@@ -57,6 +59,8 @@ class TestFusionGRUOp(OpTest):
         self.with_bias = True
         self.act_state = 'tanh'
         self.act_gate = 'sigmoid'
+        self.origin_mode = False
+        self.use_mkldnn = False
         self.set_confs()
 
         T = sum(self.lod[0])
@@ -73,7 +77,7 @@ class TestFusionGRUOp(OpTest):
                 (N, self.D), dtype='float32')
 
         _, _, _, hidden = fusion_gru(
-            x, self.lod, h0, wx, wh, bias, self.is_reverse,
+            x, self.lod, h0, wx, wh, bias, self.is_reverse, self.origin_mode,
             ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
 
         self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
@@ -89,7 +93,9 @@ class TestFusionGRUOp(OpTest):
         self.attrs = {
             'activation': self.act_state,
             'gate_activation': self.act_gate,
-            'is_reverse': self.is_reverse
+            'is_reverse': self.is_reverse,
+            'origin_mode': self.origin_mode,
+            'use_mkldnn': self.use_mkldnn
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
index 892f63bf15b742c51ddbc15262f888e43cdd03f3..bd934c76ebfa2ed7c9b11223b34c812e605ebe18 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
@@ -18,12 +18,11 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle
 
 
 class TestGatherNdOpWithEmptyIndex(OpTest):
-    """
-    Index has empty element, which means copy entire tensor
-    """
+    #Index has empty element, which means copy entire tensor
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -40,10 +39,22 @@ class TestGatherNdOpWithEmptyIndex(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestGatherNdOpWithIndex1(OpTest):
+    def setUp(self):
+        self.op_type = "gather_nd"
+        xnp = np.random.random((5, 20)).astype("float64")
+        self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")}
+        self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestGatherNdOpWithLowIndex(OpTest):
-    """
-    Index has low rank, X has high rank
-    """
+    #Index has low rank, X has high rank
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -61,10 +72,27 @@ class TestGatherNdOpWithLowIndex(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestGatherNdOpIndex1(OpTest):
+    #Index has low rank, X has high rank
+
+    def setUp(self):
+        self.op_type = "gather_nd"
+        xnp = np.random.uniform(0, 100, (10, 10)).astype("float64")
+        index = np.array([1, 2]).astype("int64")
+
+        self.inputs = {'X': xnp, 'Index': index}
+
+        self.outputs = {'Out': xnp[tuple(index.T)]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestGatherNdOpWithSameIndexAsX(OpTest):
-    """
-    Index has same rank as X's rank
-    """
+    #Index has same rank as X's rank
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -82,9 +110,7 @@ class TestGatherNdOpWithSameIndexAsX(OpTest):
 
 
 class TestGatherNdOpWithHighRankSame(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) = Rank(X)
-    """
+    #Both Index and X have high rank, and Rank(Index) = Rank(X)
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -103,9 +129,7 @@ class TestGatherNdOpWithHighRankSame(OpTest):
 
 
 class TestGatherNdOpWithHighRankDiff(OpTest):
-    """
-    Both Index and X have high rank, and Rank(Index) < Rank(X)
-    """
+    #Both Index and X have high rank, and Rank(Index) < Rank(X)
 
     def setUp(self):
         self.op_type = "gather_nd"
@@ -162,5 +186,63 @@ class TestGatherNdOpRaise(unittest.TestCase):
         self.assertRaises(IndexError, check_raise_is_test)
 
 
+class TestGatherNdError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+
+            shape = [8, 9, 6]
+            x = paddle.data(shape=shape, dtype='float32', name='x')
+            index = paddle.data(shape=shape, dtype='bool', name='index')
+            index_float = paddle.data(
+                shape=shape, dtype='float32', name='index_float')
+            np_x = np.random.random(shape).astype('float32')
+            np_index = np.array(np.random.randint(2, size=shape, dtype=bool))
+
+            def test_x_type():
+                paddle.gather_nd(np_x, index)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_index_type():
+                paddle.gather_nd(x, np_index)
+
+            self.assertRaises(TypeError, test_index_type)
+
+            def test_index_dtype():
+                paddle.gather_nd(x, index_float)
+
+            self.assertRaises(TypeError, test_index_dtype)
+
+
+class TestGatherNdAPI2(unittest.TestCase):
+    def test_static(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64')
+            index = fluid.layers.data('index', shape=[-1, 1], dtype='int32')
+            out = paddle.gather_nd(data1, index)
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            input = np.array([[1, 2], [3, 4], [5, 6]])
+            index_1 = np.array([[1]])
+            result, = exe.run(feed={"data1": input,
+                                    "index": index_1},
+                              fetch_list=[out])
+            expected_output = np.array([[3, 4]])
+        self.assertTrue(np.allclose(result, expected_output))
+
+    def test_imperative(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([[1]])
+        input = fluid.dygraph.to_variable(input_1)
+        index = fluid.dygraph.to_variable(index_1)
+        output = paddle.fluid.layers.gather(input, index)
+        output_np = output.numpy()
+        expected_output = np.array([3, 4])
+        self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index f8763e731eeed3b36a6271167a57b9277479b5ba..1f6e522d2668b5dcd2075ff7af6b4b1ee674632d 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -21,6 +21,13 @@ import paddle
 import paddle.fluid as fluid
 
 
+def gather_numpy(x, index, axis):
+    x_transpose = np.swapaxes(x, 0, axis)
+    tmp_gather = x_transpose[index, ...]
+    gather = np.swapaxes(tmp_gather, 0, axis)
+    return gather
+
+
 class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
@@ -108,12 +115,80 @@ class TestCase6(TestGatherOp):
         self.index_type = "int32"
 
 
+class TestGatherOp1(OpTest):
+    def setUp(self):
+        self.op_type = "gather"
+        self.config()
+        xnp = np.random.random(self.x_shape).astype(self.x_type)
+        axis_np = np.array(self.axis).astype(self.index_type)
+        index_np = np.array(self.index).astype(self.index_type)
+        out = gather_numpy(xnp, index_np, axis_np[0])
+        self.inputs = {'X': xnp, 'Index': index_np, 'Axis': axis_np}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (3, 88, 3)
+        self.x_type = "float64"
+        self.index = [1, 3, 5]
+        self.index_type = "int32"
+        self.axis = [1]
+        self.axis_type = "int32"
+
+
+class TestGatherOp2(TestGatherOp1):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (10, 88, 10)
+        self.x_type = "float64"
+        self.index = [1, 3, 5]
+        self.index_type = "int64"
+        self.axis = [0]
+        self.axis_type = "int32"
+
+
+class TestGatherOp3(TestGatherOp1):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (10, 88, 10)
+        self.x_type = "float64"
+        self.index = [1, 3, 5]
+        self.index_type = "int64"
+        self.axis = [2]
+        self.axis_type = "int32"
+
+
+class TestGatherOp4(TestGatherOp1):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = (3, 100, 10)
+        self.x_type = "float64"
+        self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        self.index_type = "int64"
+        self.axis = [0]
+        self.axis_type = "int32"
+
+
 class API_TestGather(unittest.TestCase):
-    def test_out(self):
+    def test_out1(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64')
-            index = fluid.layers.data('index', shape=[-1, 1], dtype='float64')
-            out = paddle.gather(data1, index)
+            index = fluid.layers.data('index', shape=[-1, 1], dtype='int32')
+            out = paddle.fluid.layers.gather(data1, index)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input = np.array([[1, 2], [3, 4], [5, 6]])
@@ -124,18 +199,103 @@ class API_TestGather(unittest.TestCase):
             expected_output = np.array([[3, 4], [5, 6]])
         self.assertTrue(np.allclose(result, expected_output))
 
+    def test_out2(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.data('x', shape=[-1, 2], dtype='float64')
+            index = paddle.data('index', shape=[-1, 1], dtype='int32')
+            axis = paddle.data('axis', shape=[1], dtype='int32')
+            out = paddle.gather(x, index, axis)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float64')
+            index_np = np.array([1, 1]).astype('int32')
+            axis_np = np.array([1]).astype('int32')
+            result, = exe.run(
+                feed={"x": x_np,
+                      "index": index_np,
+                      'axis': axis_np},
+                fetch_list=[out])
+            expected_output = gather_numpy(x_np, index_np, axis_np)
+        self.assertTrue(np.allclose(result, expected_output))
+
 
 class API_TestDygraphGather(unittest.TestCase):
-    def test_out(self):
-        with fluid.dygraph.guard():
-            input_1 = np.array([[1, 2], [3, 4], [5, 6]])
-            index_1 = np.array([1, 2])
-            input = fluid.dygraph.to_variable(input_1)
-            index = fluid.dygraph.to_variable(index_1)
-            output = paddle.fluid.layers.gather(input, index)
-            output_np = output.numpy()
-            expected_output = np.array([[3, 4], [5, 6]])
+    def test_out1(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([1, 2])
+        input = paddle.to_tensor(input_1)
+        index = paddle.to_tensor(index_1)
+        output = paddle.fluid.layers.gather(input, index)
+        output_np = output.numpy()
+        expected_output = np.array([[3, 4], [5, 6]])
+        self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+    def test_out12(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([1, 2])
+        x = paddle.to_tensor(input_1)
+        index = paddle.to_tensor(index_1)
+        output = paddle.gather(x, index, axis=0)
+        output_np = output.numpy()
+        expected_output = gather_numpy(input_1, index_1, axis=0)
         self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+
+class TestGathertError(unittest.TestCase):
+    def test_error1(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+
+            shape = [8, 9, 6]
+            x = paddle.data(shape=shape, dtype='int8', name='x')
+            axis = paddle.data(shape=[1], dtype='float32', name='axis')
+            index = paddle.data(shape=shape, dtype='int32', name='index')
+            index_float = paddle.data(
+                shape=shape, dtype='float32', name='index_float')
+
+            def test_x_type():
+                paddle.gather(x, index)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_index_type():
+                paddle.gather(x, index_float)
+
+            self.assertRaises(TypeError, test_index_type)
+
+            def test_axis_dtype():
+                paddle.gather(x, index, axis=1.11)
+
+            self.assertRaises(TypeError, test_axis_dtype)
+
+            def test_axis_dtype():
+                paddle.gather(x, index, axis=axis)
+
+            self.assertRaises(TypeError, test_axis_dtype)
+
+    def test_error2(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+
+            shape = [8, 9, 6]
+            x = fluid.data(shape=shape, dtype='int8', name='x')
+            index = fluid.data(shape=shape, dtype='int32', name='mask')
+            index_float = fluid.data(
+                shape=shape, dtype='float32', name='index_float')
+
+            def test_x_type():
+                paddle.fluid.layers.gather(x, index)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_index_type():
+                paddle.fluid.layers.gather(x, index_float)
+
+            self.assertRaises(TypeError, test_index_type)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
index a5d36203b0ad567a4ba25d686652c9384ea424bf..5054256ca72477785076cdff69266160f6c7d640 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
@@ -224,7 +224,8 @@ def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic):
 
 class TestGenerateProposalLabelsOp(OpTest):
     def set_data(self):
-        self.use_random = False
+        #self.use_random = False
+        self.init_use_random()
         self.init_test_cascade()
         self.init_test_params()
         self.init_test_input()
@@ -267,6 +268,9 @@ class TestGenerateProposalLabelsOp(OpTest):
     def init_test_cascade(self, ):
         self.is_cascade_rcnn = False
 
+    def init_use_random(self):
+        self.use_random = False
+
     def init_test_params(self):
         self.batch_size_per_im = 512
         self.fg_fraction = 0.25
@@ -329,6 +333,28 @@ class TestCascade(TestGenerateProposalLabelsOp):
         self.is_cascade_rcnn = True
 
 
+class TestUseRandom(TestGenerateProposalLabelsOp):
+    def init_use_random(self):
+        self.use_random = True
+        self.is_cascade_rcnn = False
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_out)
+
+    def verify_out(self, outs):
+        print("skip")
+
+    def init_test_params(self):
+        self.batch_size_per_im = 512
+        self.fg_fraction = 0.025
+        self.fg_thresh = 0.5
+        self.bg_thresh_hi = 0.5
+        self.bg_thresh_lo = 0.0
+        self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2]
+        self.is_cls_agnostic = False
+        self.class_nums = 2 if self.is_cls_agnostic else 81
+
+
 class TestClsAgnostic(TestCascade):
     def init_test_params(self):
         self.batch_size_per_im = 512
diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc43d3d5498284e8a24dd272eaed08cdf830733
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_generator.py
@@ -0,0 +1,44 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import unittest
+import paddle.fluid.generator as generator
+import time  # temp for debug
+
+
+class TestGenerator(unittest.TestCase):
+    """
+    Test cases for cpu generator.
+    """
+
+    def test_basic_generator(self):
+        """Test basic generator."""
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+        s = gen.initial_seed()
+        s = gen.seed()
+        st = gen.get_state()
+        gen.set_state(st)
+        gen.random()
+        gen.set_cpu_engine(gen.get_cpu_engine())
+
+    def test_basic_generator_error(self):
+        self.assertRaises(ValueError, generator.Generator, device="CUDA")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
index 6660bfb0c747300741b305a101734e1ef808eeb5..4f0beb8c0dcd5384e7b9f6e30e8082595ac4dc06 100644
--- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
@@ -124,14 +124,8 @@ class TestBase(unittest.TestCase):
                             label = item['label']
                             assert image.shape() == [BATCH_SIZE, 784]
                             assert label.shape() == [BATCH_SIZE, 1]
-                            if ps[i]._equals(fluid.CPUPlace()):
-                                assert image._place()._equals(fluid.CPUPlace())
-                                assert label._place()._equals(fluid.CPUPlace())
-                            else:
-                                assert image._place()._equals(
-                                    fluid.CUDAPinnedPlace())
-                                assert label._place()._equals(
-                                    fluid.CUDAPinnedPlace())
+                            assert image._place()._equals(ps[i])
+                            assert label._place()._equals(ps[i])
                         L, = exe.run(program=prog,
                                      feed=d,
                                      fetch_list=[loss],
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a33f32a0b6977716d8065419f8e0f88d6c4f44a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import unittest
+
+
+class GridSampleTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 x_shape=[2, 2, 3, 3],
+                 grid_shape=[2, 3, 3, 2],
+                 mode="bilinear",
+                 padding_mode="zeros",
+                 align_corners=False):
+        super(GridSampleTestCase, self).__init__(methodName)
+        self.padding_mode = padding_mode
+        self.x_shape = x_shape
+        self.grid_shape = grid_shape
+        self.mode = mode
+        self.padding_mode = padding_mode
+        self.align_corners = align_corners
+        self.dtype = "float64"
+
+    def setUp(self):
+        self.x = np.random.randn(*(self.x_shape)).astype(self.dtype)
+        self.grid = np.random.uniform(-1, 1, self.grid_shape).astype(self.dtype)
+
+    def static_functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data("x", self.x_shape, dtype=self.dtype)
+                grid = fluid.data("grid", self.grid_shape, dtype=self.dtype)
+                y_var = F.grid_sample(
+                    x,
+                    grid,
+                    mode=self.mode,
+                    padding_mode=self.padding_mode,
+                    align_corners=self.align_corners)
+        feed_dict = {"x": self.x, "grid": self.grid}
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
+        return y_np
+
+    def dynamic_functional(self):
+        x_t = paddle.to_tensor(self.x)
+        grid_t = paddle.to_tensor(self.grid)
+        y_t = F.grid_sample(
+            x_t,
+            grid_t,
+            mode=self.mode,
+            padding_mode=self.padding_mode,
+            align_corners=self.align_corners)
+        y_np = y_t.numpy()
+        return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.static_functional(place)
+        with dg.guard(place):
+            result2 = self.dynamic_functional()
+        np.testing.assert_array_almost_equal(result1, result2)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+class GridSampleErrorTestCase(GridSampleTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with self.assertRaises(ValueError):
+            self.static_functional(place)
+
+
+def add_cases(suite):
+    suite.addTest(GridSampleTestCase(methodName='runTest'))
+    suite.addTest(
+        GridSampleTestCase(
+            methodName='runTest',
+            mode='bilinear',
+            padding_mode='reflect',
+            align_corners=True))
+    suite.addTest(
+        GridSampleTestCase(
+            methodName='runTest',
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=True))
+
+
+def add_error_cases(suite):
+    suite.addTest(
+        GridSampleErrorTestCase(
+            methodName='runTest', padding_mode="VALID"))
+    suite.addTest(
+        GridSampleErrorTestCase(
+            methodName='runTest', align_corners="VALID"))
+    suite.addTest(GridSampleErrorTestCase(methodName='runTest', mode="VALID"))
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    add_cases(suite)
+    add_error_cases(suite)
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index bd5a07769e30de5110566f630de2d480e3426c77..4d1ed5aeb96ebbe064e35c1bee9d5775812440f7 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -17,17 +17,17 @@ import numpy as np
 from op_test import OpTest
 
 
-def AffineGrid(theta, size):
-    n = size[0]
-    h = size[2]
-    w = size[3]
+def AffineGrid(theta, grid_shape):
+    n = grid_shape[0]
+    h = grid_shape[1]
+    w = grid_shape[2]
     h_idx = np.repeat(
         np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
     w_idx = np.repeat(
         np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
     grid = np.concatenate(
         [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
-    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
+    grid = np.repeat(grid[np.newaxis, :], n, axis=0)  # n * h * w *3
 
     ret = np.zeros([n, h * w, 2])
     theta = theta.transpose([0, 2, 1])
@@ -40,15 +40,19 @@ def AffineGrid(theta, size):
 def getGridPointValue(data, x, y):
     data_shape = data.shape
     N = data_shape[0]
-    H = data_shape[2]
-    W = data_shape[3]
-
-    out = np.zeros(data_shape, dtype='float64')
+    C = data_shape[1]
+    in_H = data_shape[2]
+    in_W = data_shape[3]
+    out_H = x.shape[1]
+    out_W = x.shape[2]
+
+    #out = np.zeros(data_shape, dtype='float64')
+    out = np.zeros([N, C, out_H, out_W], dtype='float64')
     for i in range(N):
-        for j in range(H):
-            for k in range(W):
-                if y[i, j, k] < 0 or y[i, j, k] > H - 1 or x[i, j, k] < 0 or x[
-                        i, j, k] > W - 1:
+        for j in range(out_H):
+            for k in range(out_W):
+                if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[
+                        i, j, k] < 0 or x[i, j, k] > in_W - 1:
                     out[i, :, j, k] = 0
                 else:
                     out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
@@ -56,44 +60,89 @@ def getGridPointValue(data, x, y):
     return out
 
 
-def GridSampler(data, grid):
-    dims = data.shape
-    N = dims[0]
-    C = dims[1]
-    H = dims[2]
-    W = dims[3]
+def clip(x, min_n, max_n):
+    return np.maximum(np.minimum(x, max_n), min_n)
 
-    x = grid[:, :, :, 0]
-    y = grid[:, :, :, 1]
-    y_max = H - 1
-    x_max = W - 1
 
-    x = 0.5 * ((x.astype('float64') + 1.0) * x_max)
-    y = 0.5 * ((y.astype('float64') + 1.0) * y_max)
+def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
+    if align_corners:
+        grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val)
+    else:
+        grid_slice = 0.5 * (
+            (grid_slice.astype('float64') + 1.0) * (max_val + 1)) - 0.5
+
+    if padding_mode == "border":
+        grid_slice = clip(grid_slice, 0, max_val)
+    elif padding_mode == "reflect":
+        double_range = 2 * max_val if align_corners else (max_val + 1) * 2
+        grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice +
+                                                                   0.5)
+        extra = grid_abs - np.floor(grid_abs / double_range) * double_range
+        grid_slice = np.minimum(extra, double_range - extra)
+        grid_slice = grid_slice if align_corners else clip(grid_slice - 0.5, 0,
+                                                           max_val)
+    return grid_slice
 
-    x0 = np.floor(x).astype('int32')
-    x1 = x0 + 1
-    y0 = np.floor(y).astype('int32')
-    y1 = y0 + 1
 
-    wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
-    wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
+def GridSampler(data,
+                grid,
+                align_corners=True,
+                mode="bilinear",
+                padding_mode="zeros"):
+    dims = data.shape
+    N = dims[0]
+    in_C = dims[1]
+    in_H = dims[2]
+    in_W = dims[3]
 
-    va = getGridPointValue(data, x0, y0)
-    vb = getGridPointValue(data, x0, y1)
-    vc = getGridPointValue(data, x1, y0)
-    vd = getGridPointValue(data, x1, y1)
+    out_H = grid.shape[1]
+    out_W = grid.shape[2]
 
-    out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64')
+    x = grid[:, :, :, 0]
+    y = grid[:, :, :, 1]
+    y_max = in_H - 1
+    x_max = in_W - 1
+
+    x = unnormalizeAndClip(x, x_max, align_corners, padding_mode)
+    y = unnormalizeAndClip(y, y_max, align_corners, padding_mode)
+
+    if mode == "bilinear":
+        x0 = np.floor(x).astype('int32')
+        x1 = x0 + 1
+        y0 = np.floor(y).astype('int32')
+        y1 = y0 + 1
+
+        wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+        wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)),
+                     (1, in_C, 1, 1))
+
+        va = getGridPointValue(data, x0, y0)
+        vb = getGridPointValue(data, x0, y1)
+        vc = getGridPointValue(data, x1, y0)
+        vd = getGridPointValue(data, x1, y1)
+
+        out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64')
+    elif mode == "nearest":
+        x = np.round(x).astype('int32')
+        y = np.round(y).astype('int32')
+        out = getGridPointValue(data, x, y)
     return out
 
 
 class TestGridSamplerOp(OpTest):
     def setUp(self):
-        self.initTestCase()
+        self.use_cudnn = False
+        self.numeric_grad_delta = 0.0001
         self.op_type = 'grid_sampler'
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.initTestCase()
         x = np.random.randint(0, 255, self.x_shape).astype('float64')
 
         theta = np.zeros(self.theta_shape).astype('float64')
@@ -101,22 +150,90 @@ class TestGridSamplerOp(OpTest):
             for j in range(2):
                 for k in range(3):
                     theta[i, j, k] = np.random.rand(1)[0]
-        grid = AffineGrid(theta, self.x_shape)
+        grid = AffineGrid(theta, self.grid_shape)
 
         self.inputs = {'X': x, 'Grid': grid}
-        self.attrs = {'use_cudnn': True}
-        self.outputs = {'Output': GridSampler(x, grid)}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            "align_corners": self.align_corners,
+            "padding_mode": self.padding_mode,
+            "mode": self.mode
+        }
+        #    print("X: {}".format(x))
+        self.outputs = {
+            'Output': GridSampler(x, grid, self.align_corners, self.mode,
+                                  self.padding_mode)
+        }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)
+        self.check_grad(
+            ['X', 'Grid'],
+            'Output',
+            max_relative_error=0.01,
+            numeric_grad_delta=self.numeric_grad_delta)
+
+    def initTestCase(self):
+        self.x_shape = (2, 3, 8, 8)
+        self.grid_shape = (2, 7, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+        self.use_cudnn = True
+
+
+class Case1(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+
+
+class Case1(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "border"
+        self.mode = "bilinear"
+
+
+class Case2(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflect"
+        self.mode = "bilinear"
+
+
+class Case3(TestGridSamplerOp):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "reflect"
+        self.mode = "bilinear"
+
 
+class Case4(TestGridSamplerOp):
     def initTestCase(self):
-        self.x_shape = (2, 5, 7, 3)
-        self.grid_shape = (2, 7, 3, 2)
+        self.x_shape = (2, 3, 5, 6)
+        self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
+        self.align_corners = False
+        self.padding_mode = "reflect"
+        self.mode = "nearest"
+        self.numeric_grad_delta = 0.0001
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..654e8d6f129e1ffe0dce59113ca88a16d348f210
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestDygraphGroupNormv2(unittest.TestCase):
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [2, 6, 2, 2]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
+                    y = gn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
+                    y = gn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [2, 6, 2, 2]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = gn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = gn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py
new file mode 100644
index 0000000000000000000000000000000000000000..430ed1abe860869d791f0eac17accc8416db1eca
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
+
+
+class FSTest1(FSTestBase):
+    def test_timeout(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=100)
+        src = "hdfs_test_timeout"
+        dst = "new_hdfs_test_timeout"
+        fs.delete(dst)
+        fs.mkdirs(src)
+        fs.mkdirs(dst)
+        fs.mkdirs(dst + "/" + src)
+        output = ""
+        try:
+            fs.mv(src, dst, test_exists=False)
+            self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd,
+                                                                        output))
+        except FSTimeOut as e:
+            print("execute mv {} to {} timeout".format(src, dst))
+
+        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
+        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
+        self.assertNotEqual(ret, 0)
+        print("second mv ret:{} output:{}".format(ret, output))
+
+    def test_is_dir(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=100)
+        self.assertFalse(fs.is_dir("./test_hdfs.py"))
+        s = """
+java.io.IOException: Input/output error
+ responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
+	at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
+	at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
+	at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
+	at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
+	at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
+	at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
+	at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
+	at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
+        """
+
+        print("split lines:", s.splitlines())
+        self.assertTrue(fs._test_match(s.splitlines()) != None)
+
+    def test_config(self):
+        config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            config,
+            time_out=6 * 1000,
+            sleep_inter=100)
+
+    def test_exists(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=100)
+        self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
+        self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
+        self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
+        dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs1.py"))
+        self.assertTrue(dirs == [])
+        self.assertTrue(len(files) == 1)
+        dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7754f89e3c901ac14cb102881e8d338442038559
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
+
+
+class FSTest2(FSTestBase):
+    def test_hdfs(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=5 * 1000,
+            sleep_inter=100)
+        self._test_rm(fs)
+        self._test_touch(fs)
+        self._test_dirs(fs)
+
+    def test_local(self):
+        fs = LocalFS()
+        self._test_rm(fs)
+        self._test_touch(fs)
+        self._test_dirs(fs)
+
+        self._test_touch_file(fs)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a045f4b17fc9b8b68ccf81a23cb953db58a9db7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
+
+
+class FSTest3(FSTestBase):
+    def test_hdfs(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=5 * 1000,
+            sleep_inter=100)
+        self._test_mkdirs(fs)
+        self._test_list_dir(fs)
+        self._test_try_upload(fs)
+        self._test_try_download(fs)
+
+        self._test_upload(fs)
+        self._test_download(fs)
+
+    def test_local(self):
+        fs = LocalFS()
+        self._test_mkdirs(fs)
+        self._test_list_dir(fs)
+        self._test_try_upload(fs)
+        self._test_try_download(fs)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4f3858d6fb242b8689bd1d300861faf8ed73e54
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -0,0 +1,273 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import six
+from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting
+
+
+class SimpleConv(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(SimpleConv, self).__init__()
+        self._conv = fluid.dygraph.Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=None,
+            use_cudnn=True)
+
+    def forward(self, inputs):
+        return self._conv(inputs)
+
+
+class TestAutoCast(unittest.TestCase):
+    def test_amp_guard_white_op(self):
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard(True):
+                out_fp16 = conv2d(data)
+
+            with fluid.dygraph.amp_guard(False):
+                out_fp32 = conv2d(data)
+
+        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
+        self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
+        self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
+
+    def test_amp_guard_black_op(self):
+        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        with fluid.dygraph.guard():
+            data = fluid.dygraph.to_variable(data)
+            with fluid.dygraph.amp_guard(True):
+                out_fp32 = fluid.layers.mean(data)
+
+        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
+        self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
+
+    def test_custom_op_list(self):
+        with fluid.dygraph.guard():
+            tracer = fluid.framework._dygraph_tracer()
+            base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
+            base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST
+            with fluid.dygraph.amp_guard(
+                    custom_white_list=["log"], custom_black_list=["conv2d"]):
+                white_list, black_list = tracer._get_amp_op_list()
+                self.assertTrue(
+                    set(white_list) ==
+                    (set(base_white_list) | {"log"}) - {"conv2d"})
+
+                self.assertTrue(
+                    set(black_list) ==
+                    (set(base_black_list) - {"log"}) | {"conv2d"})
+
+    def test_custom_op_list_exception(self):
+        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
+
+        def func():
+            with fluid.dygraph.guard():
+                model = SimpleConv(
+                    num_channels=3,
+                    num_filters=64,
+                    filter_size=7,
+                    stride=2,
+                    act='relu')
+
+                with fluid.dygraph.amp_guard(
+                        custom_white_list=["conv2d"],
+                        custom_black_list=["conv2d"]):
+                    inp = fluid.dygraph.to_variable(inp_np)
+                    out = model(inp)
+
+        self.assertRaises(ValueError, func)
+
+
+class TestAmpScaler(unittest.TestCase):
+    def test_scale(self):
+        with fluid.dygraph.guard():
+            data = paddle.rand([10, 1024])
+            scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+            scaled_data = scaler.scale(data)
+            self.assertEqual(
+                np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True)
+
+    def test_minimize(self):
+        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
+
+        def run_simple_conv(inp_np, use_scaler=True):
+            paddle.manual_seed(10)
+            with fluid.dygraph.guard():
+                model = SimpleConv(
+                    num_channels=3,
+                    num_filters=64,
+                    filter_size=7,
+                    stride=2,
+                    act='relu')
+                optimizer = fluid.optimizer.SGDOptimizer(
+                    learning_rate=0.01, parameter_list=model.parameters())
+                scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+                data = fluid.dygraph.to_variable(inp_np)
+
+                out = model(data)
+                loss = fluid.layers.mean(out)
+                if use_scaler:
+                    print('use scaler')
+                    scaled_loss = scaler.scale(loss)
+                    scaled_loss.backward()
+                    optimize_ops, params_grads = scaler.minimize(optimizer,
+                                                                 scaled_loss)
+                else:
+                    print('use no scaler')
+                    loss.backward()
+                    optimize_ops, params_grads = optimizer.minimize(loss)
+            return optimize_ops, params_grads
+
+        outs_with_scaler = run_simple_conv(inp_np, use_scaler=True)
+        outs_no_scaler = run_simple_conv(inp_np, use_scaler=False)
+
+        self.assertEqual(outs_with_scaler[0],
+                         [])  # optimize_ops is [] in dygraph mode
+        self.assertEqual(outs_no_scaler[0],
+                         [])  # optimize_ops is [] in dygraph mode
+        for i in range(len(outs_with_scaler[1])):
+            # check each grad
+            self.assertEqual(
+                np.allclose(outs_with_scaler[1][i][1].numpy(),
+                            outs_no_scaler[1][i][1].numpy()), True)
+            # check each parameter
+            self.assertEqual(
+                np.allclose(outs_with_scaler[1][i][0].numpy(),
+                            outs_no_scaler[1][i][0].numpy()), True)
+
+    def test_nan_inf(self):
+        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
+        inp_np[0][1][2][3] = np.nan
+        with fluid.dygraph.guard():
+            model = SimpleConv(
+                num_channels=3,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            params_init = {}
+            for param in model.parameters():
+                params_init[param.name] = param.numpy()
+            optimizer = fluid.optimizer.SGDOptimizer(
+                learning_rate=0.01, parameter_list=model.parameters())
+            scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
+            data = fluid.dygraph.to_variable(inp_np)
+
+            out = model(data)
+            loss = fluid.layers.mean(out)
+            scaled_loss = scaler.scale(loss)
+            scaled_loss.backward()
+            optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
+            self.assertEqual(scaler._found_inf.numpy() == 1, True)
+
+            for param in model.parameters():
+                # param not update when tensor contains nan or inf
+                self.assertTrue(
+                    np.array_equal(param.numpy(), params_init[param.name]))
+
+
+class TestResnet(unittest.TestCase):
+    def train_resnet(self, enable_amp=True):
+        seed = 90
+
+        batch_size = train_parameters["batch_size"]
+        batch_num = 1
+
+        with fluid.dygraph.guard():
+            paddle.manual_seed(seed)
+
+            resnet = ResNet(use_cudnn=True)
+            optimizer = optimizer_setting(
+                train_parameters, parameter_list=resnet.parameters())
+            np.random.seed(seed)
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(use_xmap=False),
+                batch_size=batch_size)
+
+            dy_param_init_value = {}
+            for param in resnet.parameters():
+                dy_param_init_value[param.name] = param.numpy()
+
+            program = None
+            scaler = paddle.fluid.dygraph.AmpScaler(
+                enable=enable_amp, init_loss_scaling=2.**10)
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= batch_num:
+                    break
+                dy_x_data = np.array(
+                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                if len(np.array([x[1]
+                                 for x in data]).astype('int64')) != batch_size:
+                    continue
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    -1, 1)
+                img = fluid.dygraph.to_variable(dy_x_data)
+                label = fluid.dygraph.to_variable(y_data)
+                label.stop_gradient = True
+                with paddle.fluid.dygraph.amp_guard(enable=enable_amp):
+                    out = resnet(img)
+
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                avg_loss = fluid.layers.mean(x=loss)
+
+                dy_out = avg_loss.numpy()
+
+                scaled_loss = scaler.scale(avg_loss)
+                scaled_loss.backward()
+
+                scaler.minimize(optimizer, scaled_loss)
+
+                dy_grad_value = {}
+                for param in resnet.parameters():
+                    if param.trainable:
+                        np_array = np.array(param._grad_ivar().value()
+                                            .get_tensor())
+                        dy_grad_value[param.name + fluid.core.grad_var_suffix(
+                        )] = np_array
+
+                resnet.clear_gradients()
+
+                dy_param_value = {}
+                for param in resnet.parameters():
+                    dy_param_value[param.name] = param.numpy()
+
+        return dy_out, dy_param_value, dy_grad_value
+
+    def test_resnet(self):
+        out_fp32 = self.train_resnet(enable_amp=False)
+        out_amp = self.train_resnet(enable_amp=True)
+        print(out_fp32[0], out_amp[0])
+        self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index 9b6c307bbec5d272aa3c5644aeaabfe9d7f5df8f..f83f8ef35215e5a0199c4d63744882126212b928 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -21,6 +21,7 @@ from paddle.fluid import core
 from paddle.fluid import Linear
 from test_imperative_base import new_program_scope
 import paddle.fluid.dygraph_utils as dygraph_utils
+from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper
 import paddle
 
 
@@ -205,27 +206,28 @@ class TestImperative(unittest.TestCase):
         self.assertTrue(np.array_equal(dy_grad1, dy_grad2))
 
     def test_functional_paddle_imperative_dygraph_context(self):
-        self.assertFalse(paddle.imperative.enabled())
-        paddle.enable_imperative()
-        self.assertTrue(paddle.imperative.enabled())
+        self.assertFalse(paddle.in_dynamic_mode())
+        paddle.disable_static()
+        self.assertTrue(paddle.in_dynamic_mode())
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        var_inp = paddle.imperative.to_variable(np_inp)
+        var_inp = paddle.to_variable(np_inp)
         mlp = MLP(input_size=2)
         out = mlp(var_inp)
         dy_out1 = out.numpy()
         out.backward()
         dy_grad1 = mlp._linear1.weight.gradient()
-        paddle.disable_imperative()
-        self.assertFalse(paddle.imperative.enabled())
-        with paddle.imperative.guard():
-            self.assertTrue(paddle.imperative.enabled())
-            var_inp = paddle.imperative.to_variable(np_inp)
-            mlp = MLP(input_size=2)
-            out = mlp(var_inp)
-            dy_out2 = out.numpy()
-            out.backward()
-            dy_grad2 = mlp._linear1.weight.gradient()
-        self.assertFalse(paddle.imperative.enabled())
+        paddle.enable_static()
+        self.assertFalse(paddle.in_dynamic_mode())
+        paddle.disable_static()
+        self.assertTrue(paddle.in_dynamic_mode())
+        var_inp = paddle.to_variable(np_inp)
+        mlp = MLP(input_size=2)
+        out = mlp(var_inp)
+        dy_out2 = out.numpy()
+        out.backward()
+        dy_grad2 = mlp._linear1.weight.gradient()
+        paddle.enable_static()
+        self.assertFalse(paddle.in_dynamic_mode())
         self.assertTrue(np.array_equal(dy_out1, dy_out2))
         self.assertTrue(np.array_equal(dy_grad1, dy_grad2))
 
@@ -281,7 +283,7 @@ class TestImperative(unittest.TestCase):
             l0 = fluid.Linear(2, 2)
             self.assertTrue(l0.weight._grad_ivar() is None)
             l1 = fluid.Linear(2, 2)
-            with paddle.imperative.no_grad():
+            with paddle.no_grad():
                 self.assertTrue(l1.weight.stop_gradient is False)
                 tmp = l1.weight * 2
                 self.assertTrue(tmp.stop_gradient)
@@ -628,6 +630,16 @@ class TestDygraphUtils(unittest.TestCase):
             res2 = fluid.layers.sigmoid(a)
             self.assertTrue(np.allclose(res1.numpy(), res2.numpy()))
 
+    def test_append_activation_in_dygraph3(self):
+        a_np = np.random.random(size=(10, 20, 30)).astype(np.float32)
+        helper = LayerObjectHelper(fluid.unique_name.generate("test"))
+        func = helper.append_activation
+        with fluid.dygraph.guard():
+            a = fluid.dygraph.to_variable(a_np)
+            res1 = func(a, act="sigmoid", use_cudnn=True)
+            res2 = fluid.layers.sigmoid(a)
+            self.assertTrue(np.array_equal(res1.numpy(), res2.numpy()))
+
     def test_append_bias_in_dygraph_exception(self):
         with new_program_scope():
             np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
index 71b208e2cdd114ba527746d085cb066204c23777..4c9061dd83414219e7b251aafda1a21f92da2b7d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core
+from paddle.fluid.reader import use_pinned_memory
 
 
 def get_random_images_and_labels(image_shape, label_shape):
@@ -77,6 +78,18 @@ class TestDygraphDataLoader(unittest.TestCase):
                 batch_size=self.batch_size)
             self.iter_loader_data(loader)
 
+    def test_set_pin_memory(self):
+        with fluid.dygraph.guard():
+            use_pinned_memory(False)
+            loader = fluid.io.DataLoader.from_generator(
+                capacity=self.capacity, iterable=False, use_multiprocess=False)
+            loader.set_sample_generator(
+                sample_generator_creator(self.batch_size, self.batch_num),
+                batch_size=self.batch_size,
+                places=fluid.CPUPlace())
+            self.iter_loader_data(loader)
+            use_pinned_memory(True)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
index a61950f2dc0775fcbad5fd970ee95ed5ebf1c558..d3f488d92ac455072b37274e2ce782bcf41e8cc7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
@@ -43,7 +43,7 @@ class MLP(fluid.Layer):
 class TestDataParallelStateDict(unittest.TestCase):
     def test_data_parallel_state_dict(self):
         with fluid.dygraph.guard():
-            strategy = paddle.imperative.prepare_context()
+            strategy = paddle.prepare_context()
             mlp = MLP()
             parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
index 82e81d72f9a9823817355087d332c3d7fb1ffe5a..820206a3ce630eb92a36a154ca7cdec62de2ce34 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import unittest
@@ -27,7 +28,7 @@ class TestTracerMode(unittest.TestCase):
     def get_tracer_mode(self):
         assert fluid.in_dygraph_mode(), "Dygraph mode must be enabled"
 
-    @fluid.dygraph.no_grad
+    @paddle.no_grad()
     def no_grad_func(self, a):
         self.assertEqual(self.tracer._train_mode, False)
         return a
@@ -55,13 +56,32 @@ class TestTracerMode(unittest.TestCase):
             def need_no_grad_func(a, b=1):
                 return a + b
 
-            decorated_func = fluid.dygraph.no_grad(need_no_grad_func)
+            decorated_func = paddle.no_grad()(need_no_grad_func)
             self.assertTrue(
                 str(inspect.getargspec(decorated_func)) ==
                 str(inspect.getargspec(need_no_grad_func)))
 
             self.assertEqual(self.tracer._train_mode, self.init_mode)
 
+            def test_gen():
+                for i in range(3):
+                    yield i
+
+            a = 0
+            for i in test_gen():
+                a += i
+
+            @paddle.no_grad()
+            def test_wrapped_gen():
+                for i in range(3):
+                    yield i
+
+            b = 0
+            for i in test_wrapped_gen():
+                b += i
+
+            self.assertEqual(a, b)
+
         with fluid.dygraph.guard():
             self.check_not_support_rlt(False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
index 5c94f1836bf7354464bf9c21129cb14bdfaee160..429736803a192a7cdf01522406f95f8e7c892390 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -298,15 +298,16 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
 
 class TestDygraphDoubleGradVisitedUniq(TestCase):
     def test_compare(self):
-        value = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+        value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2,
+                                                          5).astype("float32")
 
         def model_f(input):
-            conv2d = fluid.dygraph.Conv2D(3, 2, 3)
+            linear = fluid.dygraph.Linear(5, 3, bias_attr=False)
             for i in range(10):
                 if i == 0:
-                    out = conv2d(input)
+                    out = linear(input)
                 else:
-                    out = out + conv2d(input)
+                    out = out + linear(input)
             return out
 
         backward_strategy = fluid.dygraph.BackwardStrategy()
@@ -321,8 +322,7 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
             dx = fluid.dygraph.grad(
                 outputs=[out],
                 inputs=[a],
-                create_graph=True,
-                retain_graph=True,
+                create_graph=False,
                 only_inputs=True,
                 allow_unused=False,
                 backward_strategy=backward_strategy)
@@ -339,9 +339,25 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
 
             grad_2 = a.gradient()
 
-        self.assertTrue(
-            np.allclose(
-                grad_1, grad_2, rtol=1.e-5, atol=1.e-8, equal_nan=True))
+        self.assertTrue(np.array_equal(grad_1, grad_2))
+
+
+class TestRaiseNoDoubleGradOp(TestCase):
+    def raise_no_grad_op(self):
+        with fluid.dygraph.guard():
+            x = fluid.layers.ones(shape=[2, 3, 2, 2], dtype='float32')
+            x.stop_gradient = False
+            y = paddle.fluid.layers.batch_norm(x)
+
+            dx = fluid.dygraph.grad(
+                outputs=[y], inputs=[x], create_graph=True,
+                retain_graph=True)[0]
+
+            loss = fluid.layers.reduce_mean(dx)
+            loss.backward()
+
+    def test_raise(self):
+        self.assertRaises(fluid.core.EnforceNotMet, self.raise_no_grad_op)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
index a391c088a3640c097ff0f4ff714bf50470c575c6..b15ad911ee79d47011be6eaa4bde62ba71c55c0e 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
@@ -28,11 +28,11 @@ class LeNetDygraph(fluid.dygraph.Layer):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = nn.Sequential(
-            nn.Conv2D(
+            nn.Conv2d(
                 1, 6, 3, stride=1, padding=1),
             nn.ReLU(),
             nn.Pool2D(2, 'max', 2),
-            nn.Conv2D(
+            nn.Conv2d(
                 6, 16, 5, stride=1, padding=0),
             nn.ReLU(),
             nn.Pool2D(2, 'max', 2))
@@ -61,7 +61,7 @@ def init_weights(layer):
         new_bias = paddle.fill_constant(
             layer.bias.shape, layer.bias.dtype, value=-0.1)
         layer.bias.set_value(new_bias)
-    elif type(layer) == nn.Conv2D:
+    elif type(layer) == nn.Conv2d:
         new_weight = paddle.fill_constant(
             layer.weight.shape, layer.weight.dtype, value=0.7)
         layer.weight.set_value(new_weight)
@@ -81,7 +81,7 @@ class TestLayerApply(unittest.TestCase):
                 if type(layer) == nn.Linear:
                     np.testing.assert_allclose(layer.weight.numpy(), 0.9)
                     np.testing.assert_allclose(layer.bias.numpy(), -0.1)
-                elif type(layer) == nn.Conv2D:
+                elif type(layer) == nn.Conv2d:
                     np.testing.assert_allclose(layer.weight.numpy(), 0.7)
                     np.testing.assert_allclose(layer.bias.numpy(), -0.2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7e0902341a59649219cf94ef9741fdf7ae09233
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle
+import paddle.nn as nn
+import paddle.fluid as fluid
+
+import numpy as np
+
+
+class LeNetDygraph(fluid.dygraph.Layer):
+    def __init__(self):
+        super(LeNetDygraph, self).__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(
+                1, 6, 3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.Pool2D(2, 'max', 2),
+            nn.Conv2d(
+                6, 16, 5, stride=1, padding=0),
+            nn.ReLU(),
+            nn.Pool2D(2, 'max', 2))
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        return x
+
+
+class TestLayerChildren(unittest.TestCase):
+    def test_apply_init_weight(self):
+        with fluid.dygraph.guard():
+            net = LeNetDygraph()
+            net.eval()
+
+            net_layers = nn.Sequential(*list(net.children()))
+            net_layers.eval()
+
+            x = paddle.rand([2, 1, 28, 28])
+
+            y1 = net(x)
+            y2 = net_layers(x)
+
+            np.testing.assert_allclose(y1.numpy(), y2.numpy())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index a3c602646b700556cea53a9b06295e38baf705dd..1e509960c076339d2d56ccfcdd7a795fa462ca82 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -153,7 +153,7 @@ class TestImperativeMnist(unittest.TestCase):
                     label.stop_gradient = True
 
                     if batch_id % 10 == 0:
-                        cost, traced_layer = paddle.imperative.TracedLayer.trace(
+                        cost, traced_layer = paddle.jit.TracedLayer.trace(
                             mnist, inputs=img)
                         if program is not None:
                             self.assertTrue(program, traced_layer.program)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f75c92b185ed338eca15cab1b624da97b1fda33
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
@@ -0,0 +1,728 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import itertools
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
+from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer
+from paddle.fluid.dygraph import Linear
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+
+# Note(wangzhongpu)
+# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer.
+
+
+class MLP(fluid.Layer):
+    def __init__(self, param_attr=None, bias_attr=None):
+        super(MLP, self).__init__()
+
+        self._fc1 = Linear(784, 10)
+        self._fc2 = Linear(10, 10)
+
+    def forward(self, inputs):
+        y = self._fc1(inputs)
+        y = self._fc2(y)
+        return y
+
+
+class TestImperativeOptimizerBase(unittest.TestCase):
+    def setUp(self):
+        self.batch_num = 20
+
+    def get_optimizer_dygraph(self, parameter_list):
+        raise NotImplementedError()
+
+    def get_optimizer(self):
+        raise NotImplementedError()
+
+    def reader_decorator(self, reader):
+        def _reader_imple():
+            for item in reader():
+                image = np.array(item[0]).reshape(1, 784)
+                label = np.array(item[1]).astype('int64').reshape(1)
+                yield image, label
+
+        return _reader_imple
+
+    def _check_exception(self, exception_message, place=None):
+        seed = 90
+        batch_size = 128
+        if place == None:
+            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+
+        with fluid.dygraph.guard(place):
+            try:
+                fluid.default_startup_program().random_seed = seed
+                fluid.default_main_program().random_seed = seed
+                mlp = MLP()
+                optimizer = self.get_optimizer_dygraph(
+                    parameter_list=mlp.parameters())
+            except Exception as e:
+                assert str(e) == exception_message
+
+    def _check_mlp(self, place=None):
+        seed = 90
+        batch_size = 128
+
+        if place == None:
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+
+        with fluid.dygraph.guard(place):
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            mlp = MLP()
+            optimizer = self.get_optimizer_dygraph(
+                parameter_list=mlp.parameters())
+
+            batch_py_reader = fluid.io.PyReader(capacity=1)
+            batch_py_reader.decorate_sample_list_generator(
+                paddle.batch(
+                    self.reader_decorator(paddle.dataset.mnist.train()),
+                    batch_size=batch_size,
+                    drop_last=True),
+                places=fluid.CPUPlace())
+
+            dy_param_init_value = {}
+            for batch_id, data in enumerate(batch_py_reader()):
+                if batch_id >= self.batch_num:
+                    break
+
+                img = data[0]
+                label = data[1]
+                label.stop_gradient = True
+
+                img = fluid.layers.reshape(img, shape=[batch_size, -1])
+                cost = mlp(img)
+                avg_loss = fluid.layers.reduce_mean(cost)
+                dy_out = avg_loss.numpy()
+
+                if batch_id == 0:
+                    for param in mlp.parameters():
+                        dy_param_init_value[param.name] = param.numpy()
+
+                avg_loss.backward()
+                optimizer.minimize(avg_loss)
+                mlp.clear_gradients()
+                dy_param_value = {}
+                for param in mlp.parameters():
+                    dy_param_value[param.name] = param.numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            if place == None:
+                place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+                ) else fluid.CUDAPlace(0)
+
+            exe = fluid.Executor(place)
+
+            mlp = MLP()
+            optimizer = self.get_optimizer()
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            img = fluid.layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            img = fluid.layers.reshape(img, shape=[batch_size, 784])
+            cost = mlp(img)
+            avg_loss = fluid.layers.reduce_mean(cost)
+            optimizer.minimize(avg_loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            for param in mlp.parameters():
+                static_param_name_list.append(param.name)
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= self.batch_num:
+                    break
+
+                static_x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    [128, 1])
+
+                fetch_list = [avg_loss.name]
+                fetch_list.extend(static_param_name_list)
+                out = exe.run(fluid.default_main_program(),
+                              feed={"pixel": static_x_data,
+                                    "label": y_data},
+                              fetch_list=fetch_list)
+
+                static_param_value = {}
+                static_out = out[0]
+                for i in range(1, len(out)):
+                    static_param_value[static_param_name_list[i - 1]] = out[i]
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
+
+        self.assertTrue(np.allclose(static_out, dy_out))
+
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value, dy_param_value[key]))
+
+
+class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        bd = [3, 6, 9]
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd,
+                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        bd = [3, 6, 9]
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
+            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.natural_exp_decay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
+            learning_rate=0.1,
+            decay_steps=10000,
+            decay_rate=0.5,
+            staircase=True))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.exponential_decay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
+            learning_rate=0.1,
+            decay_steps=10000,
+            decay_rate=0.5,
+            staircase=True))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = Adam(
+            learning_rate=fluid.layers.inverse_time_decay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
+            learning_rate=0.1,
+            decay_steps=10000,
+            decay_rate=0.5,
+            staircase=True))
+        return optimizer
+
+    def test_adam(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.polynomial_decay(
+                learning_rate=0.1, decay_steps=5, cycle=self.cycle),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
+            learning_rate=0.1, decay_steps=5, cycle=self.cycle))
+        return optimizer
+
+    def test_sgd_cycle(self):
+        self.cycle = True
+        self._check_mlp()
+
+    def test_sgd(self):
+        self.cycle = False
+        self._check_mlp()
+
+
+class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.cosine_decay(
+                learning_rate=0.1, step_each_epoch=10000, epochs=120),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
+            learning_rate=0.1, step_each_epoch=10000, epochs=120))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.noam_decay(
+                d_model=512, warmup_steps=8000),
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
+            d_model=512, warmup_steps=8000))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestOptimizerLearningRate(unittest.TestCase):
+    def test_constant_lr(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())
+
+            self.assertTrue(
+                np.allclose(
+                    adam.get_lr(), 0.001, rtol=1e-06, atol=0.0))
+
+            for i in range(10):
+                adam.minimize(loss)
+                lr = adam.get_lr()
+
+                self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0))
+
+    def test_lr_decay(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            bd = [2, 4, 6, 8]
+            value = [0.2, 0.4, 0.6, 0.8, 1.0]
+
+            adam = paddle.optimizer.Adam(
+                fluid.dygraph.PiecewiseDecay(bd, value, 0),
+                parameters=linear.parameters())
+
+            self.assertTrue(
+                np.allclose(
+                    adam.get_lr(), 0.2, rtol=1e-06, atol=0.0))
+
+            ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
+            for i in range(12):
+                adam.minimize(loss)
+                lr = adam.get_lr()
+
+                self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
+
+    def test_lr_decay_natural_exp(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+            base_lr = 1.0
+
+            adam = paddle.optimizer.Adam(
+                fluid.dygraph.NaturalExpDecay(
+                    learning_rate=base_lr,
+                    decay_steps=3,
+                    decay_rate=0.5,
+                    staircase=True),
+                parameters=linear.parameters())
+
+            self.assertTrue(
+                np.allclose(
+                    adam.get_lr(), 1.0, rtol=1e-06, atol=0.0))
+
+            ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)]
+            for i in range(5):
+                adam.minimize(loss)
+                lr = adam.get_lr()
+
+                self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
+
+    def test_set_lr(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
+
+            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+            for i in range(5):
+                adam.set_lr(lr_list[i])
+                adam.minimize(loss)
+                lr = adam.get_lr()
+                self.assertTrue(
+                    np.allclose(
+                        lr, lr_list[i], rtol=1e-06, atol=0.0))
+
+            lr_var = fluid.layers.create_global_var(
+                shape=[1], value=0.7, dtype='float32')
+            adam.set_lr(lr_var)
+            adam.minimize(loss)
+            lr = adam.get_lr()
+            self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0))
+
+            with self.assertRaises(RuntimeError):
+                adam = paddle.optimizer.Adam(
+                    fluid.dygraph.NaturalExpDecay(
+                        learning_rate=0.1,
+                        decay_steps=3,
+                        decay_rate=0.5,
+                        staircase=True),
+                    parameters=linear.parameters())
+                adam.set_lr(0.01)
+
+
+class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        return optimizer
+
+    def test_momentum(self):
+        self._check_mlp()
+
+
+class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = LarsMomentumOptimizer(
+            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        return optimizer
+
+    def test_larsmomentum(self):
+        self._check_mlp()
+
+
+class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = AdagradOptimizer(
+            learning_rate=0.2, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = AdagradOptimizer(learning_rate=0.2)
+        return optimizer
+
+    def test_adagrad(self):
+        self._check_mlp()
+
+
+class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = AdamaxOptimizer(
+            learning_rate=0.2, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = AdamaxOptimizer(learning_rate=0.2)
+        return optimizer
+
+    def test_adamax(self):
+        self._check_mlp()
+
+
+class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = DpsgdOptimizer(
+            learning_rate=0.01,
+            clip=10.0,
+            batch_size=16.0,
+            sigma=1.0,
+            parameter_list=parameter_list)
+        optimizer._seed = 100
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = DpsgdOptimizer(
+            learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
+        optimizer._seed = 100
+        return optimizer
+
+    def test_dpsgd(self):
+        self._check_mlp(place=fluid.CPUPlace())
+
+
+class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = DecayedAdagradOptimizer(
+            learning_rate=0.2, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = DecayedAdagradOptimizer(learning_rate=0.2)
+        return optimizer
+
+    def test_decayadagrad(self):
+        self._check_mlp()
+
+
+class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = AdadeltaOptimizer(
+            learning_rate=0.0003,
+            epsilon=1.0e-6,
+            rho=0.95,
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = AdadeltaOptimizer(
+            learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+        return optimizer
+
+    def test_adadelta(self):
+        self._check_mlp()
+
+
+class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = RMSPropOptimizer(
+            learning_rate=0.1, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = RMSPropOptimizer(learning_rate=0.1)
+        return optimizer
+
+    def test_rmsprop(self):
+        self._check_mlp()
+
+
+class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = FtrlOptimizer(
+            learning_rate=0.1, parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = FtrlOptimizer(learning_rate=0.1)
+        return optimizer
+
+    def test_ftrl(self):
+        self._check_mlp()
+
+
+def exclude_fn(param):
+    return param.name.endswith('.b_0')
+
+
+class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = LambOptimizer(
+            learning_rate=0.002,
+            exclude_from_weight_decay_fn=exclude_fn,
+            parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = LambOptimizer(
+            learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn)
+        return optimizer
+
+    def test_lamb(self):
+        self._check_mlp()
+
+
+class TestImperativeModelAverage(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = ModelAverage(
+            0.15, min_average_window=10000, max_average_window=12500)
+        return optimizer
+
+    def test_modelaverage(self):
+        exception_message = "In dygraph, don't support ModelAverage."
+        self._check_exception(exception_message)
+
+
+class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = DGCMomentumOptimizer(
+            learning_rate=0.0001,
+            momentum=0.9,
+            rampup_step=1000,
+            rampup_begin_step=1252,
+            sparsity=[0.999, 0.999])
+        return optimizer
+
+    def test_dgcmomentum(self):
+        exception_message = "In dygraph, don't support DGCMomentumOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = ExponentialMovingAverage(0.999)
+        return optimizer
+
+    def test_exponentialmoving(self):
+        exception_message = "In dygraph, don't support ExponentialMovingAverage."
+        self._check_exception(exception_message)
+
+
+class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = paddle.optimizer.SGD(learning_rate=0.5,
+                                         parameter_list=parameter_list)
+        optimizer = PipelineOptimizer(optimizer)
+        return optimizer
+
+    def test_pipline(self):
+        exception_message = "In dygraph, don't support PipelineOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = paddle.optimizer.SGD(learning_rate=0.5,
+                                         parameter_list=parameter_list)
+        optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
+        return optimizer
+
+    def test_lookahead(self):
+        exception_message = "In dygraph, don't support LookaheadOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
+    def get_optimizer_dygraph(self, parameter_list):
+        optimizer = paddle.optimizer.SGD(learning_rate=0.5,
+                                         parameter_list=parameter_list)
+        optimizer = RecomputeOptimizer(optimizer)
+        return optimizer
+
+    def test_recompute(self):
+        exception_message = "In dygraph, don't support RecomputeOptimizer."
+        self._check_exception(exception_message)
+
+
+class TestImperativeOptimizerList(unittest.TestCase):
+    def test_parameter_list(self):
+        with fluid.dygraph.guard():
+            linear_1 = Linear(10, 10)
+            linear_2 = Linear(10, 10)
+
+            sgd = SGDOptimizer(
+                1.0,
+                parameter_list=itertools.chain(linear_1.parameters(),
+                                               linear_2.parameters()))
+
+            in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            in_data = fluid.dygraph.to_variable(in_np)
+
+            y = linear_1(in_data)
+            y = linear_2(y)
+            loss = fluid.layers.reduce_mean(y)
+            loss.backward()
+            sgd.minimize(loss)
+
+            self.assertTrue(
+                len(sgd._parameter_list) ==
+                len(linear_1.parameters() + linear_2.parameters()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 106f58ccc99ffe42b77466e6dbf7b773ecee4ee2..815437072fde291b8d8348dba0b4b0ae872ec1b9 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -83,7 +83,8 @@ class ConvBNLayer(fluid.Layer):
                  filter_size,
                  stride=1,
                  groups=1,
-                 act=None):
+                 act=None,
+                 use_cudnn=False):
         super(ConvBNLayer, self).__init__()
 
         self._conv = Conv2D(
@@ -94,8 +95,8 @@ class ConvBNLayer(fluid.Layer):
             padding=(filter_size - 1) // 2,
             groups=groups,
             act=None,
-            bias_attr=None,
-            use_cudnn=False)
+            bias_attr=False,
+            use_cudnn=use_cudnn)
 
         self._batch_norm = BatchNorm(num_filters, act=act)
 
@@ -107,32 +108,41 @@ class ConvBNLayer(fluid.Layer):
 
 
 class BottleneckBlock(fluid.Layer):
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 use_cudnn=False):
         super(BottleneckBlock, self).__init__()
 
         self.conv0 = ConvBNLayer(
             num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
-            act='relu')
+            act='relu',
+            use_cudnn=use_cudnn)
         self.conv1 = ConvBNLayer(
             num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
-            act='relu')
+            act='relu',
+            use_cudnn=use_cudnn)
         self.conv2 = ConvBNLayer(
             num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
-            act=None)
+            act=None,
+            use_cudnn=use_cudnn)
 
         if not shortcut:
             self.short = ConvBNLayer(
                 num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
-                stride=stride)
+                stride=stride,
+                use_cudnn=use_cudnn)
 
         self.shortcut = shortcut
 
@@ -153,7 +163,7 @@ class BottleneckBlock(fluid.Layer):
 
 
 class ResNet(fluid.Layer):
-    def __init__(self, layers=50, class_dim=102):
+    def __init__(self, layers=50, class_dim=102, use_cudnn=False):
         super(ResNet, self).__init__()
 
         self.layers = layers
@@ -171,7 +181,12 @@ class ResNet(fluid.Layer):
         num_filters = [64, 128, 256, 512]
 
         self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+            num_channels=3,
+            num_filters=64,
+            filter_size=7,
+            stride=2,
+            act='relu',
+            use_cudnn=use_cudnn)
         self.pool2d_max = Pool2D(
             pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
 
@@ -186,7 +201,8 @@ class ResNet(fluid.Layer):
                         if i == 0 else num_filters[block] * 4,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut))
+                        shortcut=shortcut,
+                        use_cudnn=use_cudnn))
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index 927e51b56d727f92b75930eb0915fb5da8931f01..eb9dc926c8207f4de4a6ce7e3d0dc89cc2b965fd 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -292,7 +292,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 np_t = v.numpy()
                 self.model_base[k] = np_t
 
-            paddle.imperative.save(self.state_dict, "./test_dy")
+            paddle.save(self.state_dict, "./test_dy")
 
     def testLoadAndSetVarBase(self):
         seed = 90
@@ -373,8 +373,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
             if isinstance(adam._learning_rate, LearningRateDecay):
                 adam._learning_rate.step_num = 0
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
-                "./test_dy")
+            para_state_dict, opti_state_dict = paddle.load("./test_dy")
             adam.set_dict(opti_state_dict)
 
             opti_dict = adam.state_dict()
@@ -900,18 +899,17 @@ class TestDygraphPtbRnn(unittest.TestCase):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.imperative.save(state_dict,
-                                   os.path.join('saved_dy', 'emb_dy'))
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
+            para_state_dict, opti_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy'))
 
             self.assertTrue(opti_state_dict == None)
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
+            para_state_dict, opti_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, opti_state_dict = paddle.imperative.load(
+            para_state_dict, opti_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy.pdopt'))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ab35a21aff43af822821c14007fbdd69a081803
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
@@ -0,0 +1,917 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.dygraph.nn import Embedding, Linear
+import paddle.fluid.framework as framework
+from paddle.optimizer import Adam
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+from test_imperative_base import new_program_scope
+import numpy as np
+import six
+import paddle
+
+
+class SimpleLSTMRNN(fluid.Layer):
+    def __init__(self,
+                 hidden_size,
+                 num_steps,
+                 num_layers=2,
+                 init_scale=0.1,
+                 dropout=None):
+        super(SimpleLSTMRNN, self).__init__()
+        self._hidden_size = hidden_size
+        self._num_layers = num_layers
+        self._init_scale = init_scale
+        self._dropout = dropout
+        self._input = None
+        self._num_steps = num_steps
+        self.cell_array = []
+        self.hidden_array = []
+        self.weight_1_arr = []
+        self.weight_2_arr = []
+        self.bias_arr = []
+        self.mask_array = []
+
+        for i in range(self._num_layers):
+            weight_1 = self.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
+                shape=[self._hidden_size * 2, self._hidden_size * 4],
+                dtype="float32",
+                default_initializer=fluid.initializer.UniformInitializer(
+                    low=-self._init_scale, high=self._init_scale))
+            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
+            bias_1 = self.create_parameter(
+                attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-self._init_scale, high=self._init_scale)),
+                shape=[self._hidden_size * 4],
+                dtype="float32",
+                default_initializer=fluid.initializer.Constant(0.0))
+            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
+
+    def forward(self, input_embedding, init_hidden=None, init_cell=None):
+        self.cell_array = []
+        self.hidden_array = []
+
+        for i in range(self._num_layers):
+            pre_hidden = fluid.layers.slice(
+                init_hidden, axes=[0], starts=[i], ends=[i + 1])
+            pre_cell = fluid.layers.slice(
+                init_cell, axes=[0], starts=[i], ends=[i + 1])
+            pre_hidden = fluid.layers.reshape(
+                pre_hidden, shape=[-1, self._hidden_size])
+            pre_cell = fluid.layers.reshape(
+                pre_cell, shape=[-1, self._hidden_size])
+            self.hidden_array.append(pre_hidden)
+            self.cell_array.append(pre_cell)
+
+        res = []
+        for index in range(self._num_steps):
+            self._input = fluid.layers.slice(
+                input_embedding, axes=[1], starts=[index], ends=[index + 1])
+            self._input = fluid.layers.reshape(
+                self._input, shape=[-1, self._hidden_size])
+            for k in range(self._num_layers):
+                pre_hidden = self.hidden_array[k]
+                pre_cell = self.cell_array[k]
+                weight_1 = self.weight_1_arr[k]
+                bias = self.bias_arr[k]
+
+                nn = fluid.layers.concat([self._input, pre_hidden], 1)
+                gate_input = fluid.layers.matmul(x=nn, y=weight_1)
+
+                gate_input = fluid.layers.elementwise_add(gate_input, bias)
+                i, j, f, o = fluid.layers.split(
+                    gate_input, num_or_sections=4, dim=-1)
+                c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
+                    i) * fluid.layers.tanh(j)
+                m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
+                self.hidden_array[k] = m
+                self.cell_array[k] = c
+                self._input = m
+
+                if self._dropout is not None and self._dropout > 0.0:
+                    self._input = fluid.layers.dropout(
+                        self._input,
+                        dropout_prob=self._dropout,
+                        dropout_implementation='upscale_in_train')
+            res.append(
+                fluid.layers.reshape(
+                    self._input, shape=[1, -1, self._hidden_size]))
+        real_res = fluid.layers.concat(res, 0)
+        real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
+        last_hidden = fluid.layers.concat(self.hidden_array, 1)
+        last_hidden = fluid.layers.reshape(
+            last_hidden, shape=[-1, self._num_layers, self._hidden_size])
+        last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
+        last_cell = fluid.layers.concat(self.cell_array, 1)
+        last_cell = fluid.layers.reshape(
+            last_cell, shape=[-1, self._num_layers, self._hidden_size])
+        last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])
+        return real_res, last_hidden, last_cell
+
+
+class PtbModel(fluid.Layer):
+    def __init__(self,
+                 hidden_size,
+                 vocab_size,
+                 num_layers=2,
+                 num_steps=20,
+                 init_scale=0.1,
+                 dropout=None):
+        super(PtbModel, self).__init__()
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.init_scale = init_scale
+        self.num_layers = num_layers
+        self.num_steps = num_steps
+        self.dropout = dropout
+        self.simple_lstm_rnn = SimpleLSTMRNN(
+            hidden_size,
+            num_steps,
+            num_layers=num_layers,
+            init_scale=init_scale,
+            dropout=dropout)
+        self.embedding = Embedding(
+            size=[vocab_size, hidden_size],
+            dtype='float32',
+            is_sparse=False,
+            param_attr=fluid.ParamAttr(
+                name='embedding_para',
+                initializer=fluid.initializer.UniformInitializer(
+                    low=-init_scale, high=init_scale)))
+
+        self.softmax_weight = self.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.hidden_size, self.vocab_size],
+            dtype="float32",
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-self.init_scale, high=self.init_scale))
+        self.softmax_bias = self.create_parameter(
+            attr=fluid.ParamAttr(),
+            shape=[self.vocab_size],
+            dtype="float32",
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-self.init_scale, high=self.init_scale))
+
+    def forward(self, input, label, init_hidden, init_cell):
+        init_h = fluid.layers.reshape(
+            init_hidden, shape=[self.num_layers, -1, self.hidden_size])
+
+        init_c = fluid.layers.reshape(
+            init_cell, shape=[self.num_layers, -1, self.hidden_size])
+
+        x_emb = self.embedding(input)
+        x_emb = fluid.layers.reshape(
+            x_emb, shape=[-1, self.num_steps, self.hidden_size])
+        if self.dropout is not None and self.dropout > 0.0:
+            x_emb = fluid.layers.dropout(
+                x_emb,
+                dropout_prob=self.drop_out,
+                dropout_implementation='upscale_in_train')
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
+                                                               init_c)
+        rnn_out = fluid.layers.reshape(
+            rnn_out, shape=[-1, self.num_steps, self.hidden_size])
+
+        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
+        projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
+        projection = fluid.layers.reshape(
+            projection, shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(
+            logits=projection, label=label, soft_label=False)
+        loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
+        loss = fluid.layers.reduce_mean(loss, dim=[0])
+        loss = fluid.layers.reduce_sum(loss)
+
+        return loss, last_hidden, last_cell
+
+
+class TestDygraphPtbRnn(unittest.TestCase):
+    def setUp(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr_arr),
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            self.opti_dict = adam.state_dict()
+            self.base_opti = {}
+            for k, v in self.opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.base_opti[v.name] = v.numpy()
+                    self.assertTrue(np.sum(np.abs(v.numpy())) != 0)
+                else:
+                    self.base_opti[k] = v
+
+            fluid.save_dygraph(self.opti_dict, "./test_dy")
+
+            self.state_dict = ptb_model.state_dict()
+
+            self.model_base = {}
+            for k, v in self.state_dict.items():
+                np_t = v.numpy()
+                self.model_base[k] = np_t
+
+            paddle.save(self.state_dict, "./test_dy")
+
+    def testLoadAndSetVarBase(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr_arr),
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            opti_dict = adam.state_dict()
+            # set to zero
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+
+            if isinstance(adam._learning_rate, LearningRateDecay):
+                adam._learning_rate.step_num = 0
+
+            para_state_dict, opti_state_dict = paddle.load("./test_dy")
+            adam.set_state_dict(opti_state_dict)
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
+
+            # check parameter
+            state_dict = ptb_model.state_dict()
+            for k, v in state_dict.items():
+                np_t = v.numpy()
+                var = v.value().get_tensor()
+
+                var.set(np.zeros_like(np_t), place)
+
+            ptb_model.set_dict(para_state_dict)
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetVariable(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr_arr),
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            opti_dict = adam.state_dict()
+            # set to zero
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+
+            if isinstance(adam._learning_rate, LearningRateDecay):
+                adam._learning_rate.step_num = 0
+
+            adam.set_state_dict(self.opti_dict)
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
+
+            # check parameter
+            state_dict = ptb_model.state_dict()
+            for k, v in state_dict.items():
+                np_t = v.numpy()
+                var = v.value().get_tensor()
+
+                var.set(np.zeros_like(np_t), place)
+
+            ptb_model.set_dict(self.state_dict)
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetNumpy(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [1.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                new_lr = 1.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr_arr),
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+                if i == 0:
+                    for param in ptb_model.parameters():
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param.numpy()
+
+            # check optimizer
+            opti_dict = adam.state_dict()
+            np_opti_dict = {}
+            # set to zero
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    np_opti_dict[v.name] = np_t
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+                else:
+                    np_opti_dict[k] = v
+
+            if isinstance(adam._learning_rate, LearningRateDecay):
+                adam._learning_rate.step_num = 0
+
+            adam.set_state_dict(np_opti_dict)
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
+
+            # check parameter
+            state_dict = ptb_model.state_dict()
+            np_state_dict = {}
+            for k, v in state_dict.items():
+                np_t = v.numpy()
+                np_state_dict[k] = np_t
+                var = v.value().get_tensor()
+
+                var.set(np.zeros_like(np_t), place)
+
+            ptb_model.set_dict(np_state_dict)
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetVariableBeforeTrain(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=0.0,
+                beta1=0.8,
+                beta2=0.6,
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            adam.set_state_dict(self.opti_dict)
+            ptb_model.set_dict(self.state_dict)
+
+            for i in range(1):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if k == "global_step":
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
+
+                if k.find("beta1_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta1))
+                if k.find("beta2_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta2))
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testLoadAndSetVarBaseBeforeTrain(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [0.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                # set lr to zero not update parameter
+                new_lr = 0.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=0.0,
+                beta1=0.8,
+                beta2=0.6,
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            state_dict, opti_dict = fluid.load_dygraph("./test_dy")
+            adam.set_state_dict(opti_dict)
+            ptb_model.set_dict(state_dict)
+
+            for i in range(1):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if k == "global_step":
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
+
+                if k.find("beta1_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta1))
+                if k.find("beta2_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta2))
+
+            # check parameter
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testSetNumpyBeforeTrain(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 1000
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 200
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # TODO: marsyang1993 Change seed to
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            bd = []
+            lr_arr = [0.0]
+            # this a fake lr decay strategy
+            for i in range(1, 10):
+                bd.append(100 * i)
+                # set lr to 0.0, not update parameter
+                new_lr = 0.0
+                lr_arr.append(new_lr)
+
+            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            adam = Adam(
+                learning_rate=fluid.layers.piecewise_decay(
+                    boundaries=bd, values=lr_arr),
+                beta1=0.8,
+                beta2=0.6,
+                parameters=ptb_model.parameters())
+            dy_param_updated = dict()
+            dy_param_init = dict()
+            dy_loss = None
+            last_hidden = None
+            last_cell = None
+
+            np_opti_dict = {}
+            np_state_dict = {}
+
+            for k, v in self.opti_dict.items():
+                if isinstance(v, core.VarBase):
+                    np_opti_dict[v.name] = v.numpy()
+                else:
+                    np_opti_dict[k] = v
+
+            for k, v in self.state_dict.items():
+                np_state_dict[k] = v.numpy()
+
+            adam.set_state_dict(np_opti_dict)
+            ptb_model.set_dict(np_state_dict)
+            for i in range(1):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                y_data = y_data.reshape((-1, 1))
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='float32')
+                x = to_variable(x_data)
+                y = to_variable(y_data)
+                init_hidden = to_variable(init_hidden_data)
+                init_cell = to_variable(init_cell_data)
+                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                            init_cell)
+
+                dy_loss.backward()
+                adam.minimize(dy_loss)
+                ptb_model.clear_gradients()
+
+            opti_dict = adam.state_dict()
+            for k, v in opti_dict.items():
+                if k == "global_step":
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
+
+                if k.find("beta1_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta1))
+                if k.find("beta2_pow_acc_0") > 0:
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name] *
+                                       adam._beta2))
+
+            # check parameter
+
+            state_dict = ptb_model.state_dict()
+
+            for k, v in state_dict.items():
+                new_t = v.numpy()
+
+                base_t = self.model_base[k]
+                self.assertTrue(np.array_equal(new_t, base_t))
+
+    def testOnlyLoadParams(self):
+        with fluid.dygraph.guard():
+            emb = fluid.dygraph.Embedding([10, 10])
+            state_dict = emb.state_dict()
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+
+            para_state_dict, opti_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy'))
+
+            self.assertTrue(opti_state_dict == None)
+
+            para_state_dict, opti_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy.pdparams'))
+
+            para_state_dict, opti_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy.pdopt'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
index 2789174ba7a5805b86557a9a465c661a906bc0a7..9878e2f9ad772fe3d03addb4ced9f3b66a6cd58a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -47,33 +47,34 @@ class TestSimpleNet(unittest.TestCase):
         for place in places:
             for dtype in ["float32", "float64"]:
                 for sort_sum_gradient in [True, False]:
-                    with paddle.imperative.guard(place):
-                        backward_strategy = paddle.imperative.BackwardStrategy()
-                        backward_strategy.sort_sum_gradient = sort_sum_gradient
-                        # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                    paddle.disable_static(place)
+                    backward_strategy = paddle.BackwardStrategy()
+                    backward_strategy.sort_sum_gradient = sort_sum_gradient
+                    # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
-                        input_word = np.array([[1, 2], [2, 1]]).astype('int64')
-                        input = paddle.imperative.to_variable(input_word)
+                    input_word = np.array([[1, 2], [2, 1]]).astype('int64')
+                    input = paddle.to_variable(input_word)
 
-                        simplenet = SimpleNet(20, 32, dtype)
-                        adam = SGDOptimizer(
-                            learning_rate=0.001,
-                            parameter_list=simplenet.parameters(
-                            ))  # grad_clip=grad_clip
-                        input_emb, emb = simplenet(input)
+                    simplenet = SimpleNet(20, 32, dtype)
+                    adam = SGDOptimizer(
+                        learning_rate=0.001,
+                        parameter_list=simplenet.parameters(
+                        ))  # grad_clip=grad_clip
+                    input_emb, emb = simplenet(input)
 
-                        self.assertTrue(emb.weight.gradient() is None)
-                        self.assertTrue(input_emb.gradient() is None)
+                    self.assertTrue(emb.weight.gradient() is None)
+                    self.assertTrue(input_emb.gradient() is None)
 
-                        input_emb.backward(backward_strategy)
-                        adam.minimize(input_emb)
-                        self.assertTrue(emb.weight.gradient() is not None)
+                    input_emb.backward(backward_strategy)
+                    adam.minimize(input_emb)
+                    self.assertTrue(emb.weight.gradient() is not None)
 
-                        emb.clear_gradients()
-                        self.assertTrue(emb.weight.gradient() is None)
+                    emb.clear_gradients()
+                    self.assertTrue(emb.weight.gradient() is None)
 
-                        input_emb.clear_gradient()
-                        self.assertTrue(input_emb.gradient() is not None)
+                    input_emb.clear_gradient()
+                    self.assertTrue(input_emb.gradient() is not None)
+                    paddle.enable_static()
 
     def test_selectedrows_gradient2(self):
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
index afdab0148cbf078db6e183c7e6105fdb27d3266b..acc56b7db27f48cad92ed44cddfcfd4b9591dba3 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
@@ -25,6 +25,8 @@ import paddle.fluid as fluid
 from paddle.fluid import core
 from test_imperative_base import new_program_scope
 
+LOADED_VAR_SUFFIX = ".load_0"
+
 
 def convolutional_neural_network(img):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
@@ -307,14 +309,14 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
         self.assertTrue(np.array_equal(static_x_data, dy_x_data))
 
         for key, value in six.iteritems(static_param_init_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
 
         # np.testing.assert_array_almost_equal(static_out, dy_out)
         self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04))
 
         for key, value in six.iteritems(static_param_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-4))
 
     def test_mnist_train_with_params_filename(self):
@@ -335,14 +337,14 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
         self.assertTrue(np.array_equal(static_x_data, dy_x_data))
 
         for key, value in six.iteritems(static_param_init_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
 
         # np.testing.assert_array_almost_equal(static_out, dy_out)
         self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04))
 
         for key, value in six.iteritems(static_param_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-4))
 
     def test_mnist_infer_no_params_filename(self):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
index f501593d09d85a357113e075514c7d7cfbecaa2a..0792582175ef03cba3d3ba809132f3c591ecfe87 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
@@ -27,6 +27,8 @@ from test_imperative_base import new_program_scope
 
 import paddle.fluid.transpiler.details.program_utils as pu
 
+LOADED_VAR_SUFFIX = ".load_0"
+
 
 def while_softmax_regression(img):
     def cond(i, times, pred):
@@ -219,13 +221,13 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
 
         # Phase 3. compare
         for key, value in six.iteritems(static_param_init_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
 
         self.assertTrue(np.allclose(static_out, dy_out))
 
         for key, value in six.iteritems(static_param_value):
-            key += core.loaded_var_suffix()
+            key += LOADED_VAR_SUFFIX
             self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02ba1a584b52dbbc99fcc8ed7bad438e7a9dd46
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestInstanceNorm(unittest.TestCase):
+    def test_error(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+
+            def error1d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                instance_norm1d = paddle.nn.InstanceNorm1d(1)
+                instance_norm1d(fluid.dygraph.to_variable(x_data_4))
+
+            def error2d():
+                x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
+                instance_norm2d = paddle.nn.InstanceNorm2d(1)
+                instance_norm2d(fluid.dygraph.to_variable(x_data_3))
+
+            def error3d():
+                x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
+                instance_norm3d = paddle.nn.BatchNorm3d(1)
+                instance_norm3d(fluid.dygraph.to_variable(x_data_4))
+
+            with fluid.dygraph.guard(p):
+                self.assertRaises(ValueError, error1d)
+                self.assertRaises(ValueError, error2d)
+                self.assertRaises(ValueError, error3d)
+
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    bn = fluid.dygraph.InstanceNorm(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    bn = paddle.nn.InstanceNorm2d(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    ins = fluid.dygraph.InstanceNorm(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ins(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    ins = paddle.nn.InstanceNorm2d(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ins(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inverse_op.py b/python/paddle/fluid/tests/unittests/test_inverse_op.py
index 5349654ac27800d2e70c4b77f6531853178fd3ed..fd540dcd741eef4c007eae19a982bc186c09d7d7 100644
--- a/python/paddle/fluid/tests/unittests/test_inverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_inverse_op.py
@@ -89,8 +89,7 @@ class TestInverseAPI(unittest.TestCase):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[4, 4], dtype="float64")
-            result = paddle.inverse(input=input)
-
+            result = paddle.inverse(x=input)
             input_np = np.random.random([4, 4]).astype("float64")
             result_np = np.linalg.inv(input_np)
 
@@ -145,7 +144,7 @@ class TestInverseSingularAPI(unittest.TestCase):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[4, 4], dtype="float64")
-            result = paddle.inverse(input=input)
+            result = paddle.inverse(x=input)
 
             input_np = np.zeros([4, 4]).astype("float64")
 
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a868e751f0567e6387b0e9471f0382c9456bcb6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+
+
+def run_static(x_np, dtype, op_str, use_gpu=False):
+    paddle.enable_static()
+    startup_program = fluid.Program()
+    main_program = fluid.Program()
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    with fluid.program_guard(main_program, startup_program):
+        x = paddle.data(name='x', shape=x_np.shape, dtype=dtype)
+        res = getattr(paddle.tensor, op_str)(x)
+        exe.run(startup_program)
+        static_result = exe.run(main_program,
+                                feed={'x': x_np},
+                                fetch_list=[res])
+    return static_result
+
+
+def run_dygraph(x_np, op_str, use_gpu=True):
+    place = paddle.CPUPlace()
+    if use_gpu and fluid.core.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    paddle.disable_static(place)
+    x = paddle.to_variable(x_np)
+    dygraph_result = getattr(paddle.tensor, op_str)(x)
+    return dygraph_result
+
+
+def np_data_generator(low, high, np_shape, type, sv_list, op_str, *args,
+                      **kwargs):
+    x_np = np.random.uniform(low, high, np_shape).astype(getattr(np, type))
+    # x_np.shape[0] >= len(sv_list)
+    if type in ['float16', 'float32', 'float64']:
+        for i, v in enumerate(sv_list):
+            x_np[i] = v
+    ori_shape = x_np.shape
+    x_np = x_np.reshape((np.product(ori_shape), ))
+    np.random.shuffle(x_np)
+    x_np = x_np.reshape(ori_shape)
+    result_np = getattr(np, op_str)(x_np)
+    return x_np, result_np
+
+
+TEST_META_DATA = [
+    {
+        'low': 0.1,
+        'high': 1,
+        'np_shape': [8, 17, 5, 6, 7],
+        'type': 'float16',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0.1,
+        'high': 1,
+        'np_shape': [11, 17],
+        'type': 'float32',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0.1,
+        'high': 1,
+        'np_shape': [2, 3, 4, 5],
+        'type': 'float64',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0,
+        'high': 100,
+        'np_shape': [11, 17, 10],
+        'type': 'int32',
+        'sv_list': [np.inf, np.nan]
+    },
+    {
+        'low': 0,
+        'high': 999,
+        'np_shape': [132],
+        'type': 'int64',
+        'sv_list': [np.inf, np.nan]
+    },
+]
+
+
+def test(test_case, op_str, use_gpu=False):
+    for meta_data in TEST_META_DATA:
+        meta_data = dict(meta_data)
+        meta_data['op_str'] = op_str
+        x_np, result_np = np_data_generator(**meta_data)
+        static_result = run_static(x_np, meta_data['type'], op_str, use_gpu)
+        dygraph_result = run_dygraph(x_np, op_str, use_gpu)
+        test_case.assertTrue((static_result == result_np).all())
+        test_case.assertTrue((dygraph_result.numpy() == result_np).all())
+
+
+class TestCPUNormal(unittest.TestCase):
+    def test_inf(self):
+        test(self, 'isinf')
+
+    def test_nan(self):
+        test(self, 'isnan')
+
+    def test_finite(self):
+        test(self, 'isfinite')
+
+
+class TestCUDANormal(unittest.TestCase):
+    def test_inf(self):
+        test(self, 'isinf', True)
+
+    def test_nan(self):
+        test(self, 'isnan', True)
+
+    def test_finite(self):
+        test(self, 'isfinite', True)
+
+
+class TestError(unittest.TestCase):
+    def test_bad_input(self):
+        paddle.enable_static()
+        with fluid.program_guard(fluid.Program()):
+
+            def test_isinf_bad_x():
+                x = [1, 2, 3]
+                result = paddle.tensor.isinf(x)
+
+            self.assertRaises(TypeError, test_isinf_bad_x)
+
+            def test_isnan_bad_x():
+                x = [1, 2, 3]
+                result = paddle.tensor.isnan(x)
+
+            self.assertRaises(TypeError, test_isnan_bad_x)
+
+            def test_isfinite_bad_x():
+                x = [1, 2, 3]
+                result = paddle.tensor.isfinite(x)
+
+            self.assertRaises(TypeError, test_isfinite_bad_x)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index abc46034957cf7414310f0f593f3bcce71a6d1de..4d7711a5df9fc3b70bcb3137dee0bcc949135266 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -14,31 +14,34 @@
 
 from __future__ import print_function
 
+import os
+import pickle
 import unittest
 import numpy as np
 
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
-from paddle.fluid.dygraph import declarative
+from paddle.fluid.dygraph import declarative, ProgramTranslator
+from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
 
 BATCH_SIZE = 32
-BATCH_NUM = 20
+BATCH_NUM = 10
 SEED = 10
 
 
-def random_batch_reader():
-    def _get_random_images_and_labels(image_shape, label_shape):
+def random_batch_reader(input_size, label_size):
+    def _get_random_inputs_and_labels(input_size, label_size):
         np.random.seed(SEED)
-        image = np.random.random(size=image_shape).astype('float32')
-        label = np.random.random(size=label_shape).astype('int64')
-        return image, label
+        input = np.random.random(size=input_size).astype('float32')
+        label = np.random.random(size=label_size).astype('int64')
+        return input, label
 
     def __reader__():
         for _ in range(BATCH_NUM):
-            batch_image, batch_label = _get_random_images_and_labels(
-                [BATCH_SIZE, 784], [BATCH_SIZE, 1])
-            yield batch_image, batch_label
+            batch_input, batch_label = _get_random_inputs_and_labels(
+                [BATCH_SIZE, input_size], [BATCH_SIZE, label_size])
+            yield batch_input, batch_label
 
     return __reader__
 
@@ -75,13 +78,14 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
         return z, loss
 
 
-def train(layer):
+def train(layer, input_size=784, label_size=1):
     # create optimizer
-    adam = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.1, parameter_list=layer.parameters())
+    adam = fluid.optimizer.SGDOptimizer(
+        learning_rate=0.01, parameter_list=layer.parameters())
     # create data loader
     train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-    train_loader.set_batch_generator(random_batch_reader())
+    train_loader.set_batch_generator(
+        random_batch_reader(input_size, label_size))
     # train
     for data in train_loader():
         img, label = data
@@ -98,11 +102,6 @@ def train(layer):
     return [img], layer, avg_loss
 
 
-def infer(layer):
-    x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
-    return layer(x)
-
-
 class TestJitSaveLoad(unittest.TestCase):
     def setUp(self):
         self.model_path = "model.test_jit_save_load"
@@ -111,37 +110,43 @@ class TestJitSaveLoad(unittest.TestCase):
         # config seed
         fluid.default_main_program().random_seed = SEED
 
-    def train_and_save_model(self):
+    def train_and_save_model(self, model_path=None, configs=None):
         layer = LinearNet(784, 1)
         example_inputs, layer, _ = train(layer)
+        final_model_path = model_path if model_path else self.model_path
         orig_input_types = [type(x) for x in example_inputs]
         fluid.dygraph.jit.save(
-            layer=layer, model_path=self.model_path, input_spec=example_inputs)
+            layer=layer,
+            model_path=final_model_path,
+            input_spec=example_inputs,
+            configs=configs)
         new_input_types = [type(x) for x in example_inputs]
         self.assertEqual(orig_input_types, new_input_types)
         return layer
 
-    def test_save(self):
-        # train and save model
-        self.train_and_save_model()
-
-    def test_load_infernece(self):
+    def test_save_load(self):
         # train and save model
         train_layer = self.train_and_save_model()
         # load model
-        infer_layer = fluid.dygraph.jit.load(self.model_path)
+        program_translator = ProgramTranslator()
+        program_translator.enable(False)
+        loaded_layer = fluid.dygraph.jit.load(self.model_path)
+        self.load_and_inference(train_layer, loaded_layer)
+        self.load_dygraph_state_dict(train_layer)
+        self.load_and_finetune(train_layer, loaded_layer)
+        program_translator.enable(True)
+
+    def load_and_inference(self, train_layer, infer_layer):
         train_layer.eval()
+        infer_layer.eval()
         # inference & compare
         x = fluid.dygraph.to_variable(
             np.random.random((1, 784)).astype('float32'))
         self.assertTrue(
             np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
 
-    def test_load_finetune(self):
-        # train and save model
-        train_layer = self.train_and_save_model()
-        # load model
-        load_train_layer = fluid.dygraph.jit.load(self.model_path)
+    def load_and_finetune(self, train_layer, load_train_layer):
+        train_layer.train()
         load_train_layer.train()
         # train & compare
         _, _, train_loss = train(train_layer)
@@ -149,6 +154,19 @@ class TestJitSaveLoad(unittest.TestCase):
         self.assertTrue(
             np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
 
+    def load_dygraph_state_dict(self, train_layer):
+        train_layer.eval()
+        # contruct new model
+        new_layer = LinearNet(784, 1)
+        model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
+        new_layer.set_dict(model_dict)
+        new_layer.eval()
+        # inference & compare
+        x = fluid.dygraph.to_variable(
+            np.random.random((1, 784)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
+
     def test_save_get_program_failed(self):
         layer = LinearNetNotDeclarative(784, 1)
         example_inputs, layer, _ = train(layer)
@@ -158,6 +176,31 @@ class TestJitSaveLoad(unittest.TestCase):
                 model_path=self.model_path,
                 input_spec=example_inputs)
 
+    def test_load_dygraoh_no_path(self):
+        model_path = "model.test_jit_save_load.no_path"
+        new_layer = LinearNet(784, 1)
+        with self.assertRaises(ValueError):
+            model_dict, _ = fluid.dygraph.load_dygraph(model_path)
+
+    def test_load_dygraph_no_var_info(self):
+        model_path = "model.test_jit_save_load.no_var_info"
+        self.train_and_save_model(model_path=model_path)
+        # remove `__variables.info__`
+        var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+        os.remove(var_info_path)
+        new_layer = LinearNet(784, 1)
+        with self.assertRaises(RuntimeError):
+            model_dict, _ = fluid.dygraph.load_dygraph(model_path)
+
+    def test_load_dygraph_not_var_file(self):
+        model_path = "model.test_jit_save_load.no_var_file"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.params_filename = "__params__"
+        self.train_and_save_model(model_path=model_path, configs=configs)
+        new_layer = LinearNet(784, 1)
+        with self.assertRaises(RuntimeError):
+            model_dict, _ = fluid.dygraph.load_dygraph(model_path)
+
 
 class TestJitSaveLoadConfig(unittest.TestCase):
     def setUp(self):
@@ -233,5 +276,119 @@ class TestJitSaveLoadConfig(unittest.TestCase):
             np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
 
 
+class MultiLoadingLinearNet(fluid.dygraph.Layer):
+    def __init__(self, size, model_path):
+        super(MultiLoadingLinearNet, self).__init__()
+        self._linear = Linear(size, size)
+        self._load_linear1 = fluid.dygraph.jit.load(model_path)
+        self._load_linear2 = fluid.dygraph.jit.load(model_path)
+
+    @declarative
+    def forward(self, x):
+        tmp1 = self._linear(x)
+        tmp2 = self._load_linear1(tmp1)
+        tmp3 = self._load_linear2(tmp2)
+        y = self._linear(tmp3)
+        return y
+
+
+class TestJitMultipleLoading(unittest.TestCase):
+    def setUp(self):
+        self.linear_size = 4
+        self.model_path = "model.jit_multi_load"
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        fluid.default_main_program().random_seed = SEED
+        # train and save base model
+        self.train_and_save_orig_model()
+
+    def train_and_save_orig_model(self):
+        layer = LinearNet(self.linear_size, self.linear_size)
+        example_inputs, layer, _ = train(layer, self.linear_size, 1)
+        fluid.dygraph.jit.save(
+            layer=layer, model_path=self.model_path, input_spec=example_inputs)
+
+    def test_load_model_retransform_inference(self):
+        multi_loaded_layer = MultiLoadingLinearNet(self.linear_size,
+                                                   self.model_path)
+        state_dict = multi_loaded_layer.state_dict()
+        name_set = set()
+        for _, var in state_dict.items():
+            self.assertTrue(var.name not in name_set)
+            name_set.add(var.name)
+
+
+class LinearNetReturnHidden(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetReturnHidden, self).__init__()
+        self._linear_1 = Linear(in_size, out_size)
+        self._linear_2 = Linear(in_size, out_size)
+
+    @declarative
+    def forward(self, x):
+        y = self._linear_1(x)
+        z = self._linear_2(y)
+        loss = fluid.layers.mean(z)
+        return y, loss
+
+
+class TestJitPruneModelAndLoad(unittest.TestCase):
+    def setUp(self):
+        self.linear_size = 4
+        self.model_path = "model.jit_prune_model_and_load"
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        fluid.default_main_program().random_seed = SEED
+
+    def train_and_save(self):
+        train_layer = LinearNetReturnHidden(8, 8)
+        adam = fluid.optimizer.AdamOptimizer(
+            learning_rate=0.1, parameter_list=train_layer.parameters())
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        for i in range(10):
+            hidden, loss = train_layer(x)
+            loss.backward()
+            adam.minimize(loss)
+            train_layer.clear_gradients()
+
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.output_spec = [hidden]
+        fluid.dygraph.jit.save(
+            layer=train_layer,
+            model_path=self.model_path,
+            input_spec=[x],
+            configs=configs)
+
+        return train_layer
+
+    def test_load_pruned_model(self):
+        train_layer = self.train_and_save()
+        train_layer.eval()
+
+        infer_layer = fluid.dygraph.jit.load(self.model_path)
+
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
+
+    def test_load_var_not_in_extra_var_info(self):
+        self.train_and_save()
+
+        # chage extra var info
+        var_info_path = os.path.join(self.model_path, EXTRA_VAR_INFO_FILENAME)
+        with open(var_info_path, 'rb') as f:
+            extra_var_info = pickle.load(f)
+            extra_var_info.clear()
+        with open(var_info_path, 'wb') as f:
+            pickle.dump(extra_var_info, f, protocol=2)
+
+        with self.assertRaises(RuntimeError):
+            fluid.dygraph.jit.load(self.model_path)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index a19b4d9c13a9e646da405babfbac98f7ed15f217..8780727e4cb276a989a8d04d05c6419a4874e7f5 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -13,6 +13,7 @@
 
 from __future__ import division
 
+import paddle
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -77,5 +78,36 @@ class TestKLDivLossOp4(TestKLDivLossOp):
         self.reduction = 'sum'
 
 
+class TestKLDivLossDygraph(unittest.TestCase):
+    def run_kl_loss(self, reduction, shape=(5, 20)):
+        x = np.random.uniform(-10, 10, shape).astype('float64')
+        target = np.random.uniform(-10, 10, shape).astype('float64')
+        gt_loss = kldiv_loss(x, target, reduction)
+
+        with paddle.fluid.dygraph.guard():
+            kldiv_criterion = paddle.nn.KLDivLoss(reduction)
+            pred_loss = kldiv_criterion(
+                paddle.to_variable(x), paddle.to_variable(target))
+            self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
+
+    def test_kl_loss_batchmean(self):
+        self.run_kl_loss('batchmean')
+
+    def test_kl_loss_mean(self):
+        self.run_kl_loss('mean')
+
+    def test_kl_loss_sum(self):
+        self.run_kl_loss('sum')
+
+    def test_kl_loss_none(self):
+        self.run_kl_loss('none')
+
+    def test_kl_loss_static_api(self):
+        input = paddle.fluid.data(name='input', shape=[5, 20])
+        label = paddle.fluid.data(name='label', shape=[5, 20])
+
+        pred_loss = paddle.nn.functional.kl_div(input, label)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py
index d7e801a666f110471180d704a6fda6dc0f9aeb1e..6a15fe494779f93c2f36a301594aaccf55283902 100644
--- a/python/paddle/fluid/tests/unittests/test_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py
@@ -20,111 +20,165 @@ import numpy as np
 import unittest
 
 
-class TestL1Loss(unittest.TestCase):
-    def test_L1Loss_mean(self):
-        input_np = np.random.random(size=(10, 1)).astype(np.float32)
-        label_np = np.random.random(size=(10, 1)).astype(np.float32)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.layers.data(
-                name='input', shape=[10, 1], dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=[10, 1], dtype='float32')
-            l1_loss = paddle.nn.loss.L1Loss()
-            ret = l1_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[ret])
-
-        with fluid.dygraph.guard():
-            l1_loss = paddle.nn.loss.L1Loss()
-            dy_ret = l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_ret.numpy()
-
-        expected = np.mean(np.abs(input_np - label_np))
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
+class TestFunctionalL1Loss(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+        self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+
+    def run_imperative(self):
+        input = paddle.to_variable(self.input_np)
+        label = paddle.to_variable(self.label_np)
+        dy_result = paddle.nn.functional.l1_loss(input, label)
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
-    def test_L1Loss_sum(self):
-        input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
-        label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.layers.data(
+        dy_result = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [1])
+
+        dy_result = paddle.nn.functional.l1_loss(input, label, reduction='none')
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [10, 10, 5])
+
+    def run_static(self, use_gpu=False):
+        input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32')
+        label = paddle.data(name='label', shape=[10, 10, 5], dtype='float32')
+        result0 = paddle.nn.functional.l1_loss(input, label)
+        result1 = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+        result2 = paddle.nn.functional.l1_loss(input, label, reduction='none')
+        y = paddle.nn.functional.l1_loss(input, label, name='aaa')
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        static_result = exe.run(
+            feed={"input": self.input_np,
+                  "label": self.label_np},
+            fetch_list=[result0, result1, result2])
+
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[0], expected))
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[1], expected))
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(static_result[2], expected))
+
+        self.assertTrue('aaa' in y.name)
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+    # test case the raise message
+    def test_errors(self):
+        def test_value_error():
+            input = paddle.data(
                 name='input', shape=[10, 10, 5], dtype='float32')
-            label = fluid.layers.data(
+            label = paddle.data(
                 name='label', shape=[10, 10, 5], dtype='float32')
-            l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
-            ret = l1_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[ret])
-
-        with fluid.dygraph.guard():
-            l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
-            dy_ret = l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_ret.numpy()
-
-        expected = np.sum(np.abs(input_np - label_np))
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
+            loss = paddle.nn.functional.l1_loss(
+                input, label, reduction='reduce_mean')
+
+        self.assertRaises(ValueError, test_value_error)
+
+
+class TestClassL1Loss(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+        self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
+
+    def run_imperative(self):
+        input = paddle.to_variable(self.input_np)
+        label = paddle.to_variable(self.label_np)
+        l1_loss = paddle.nn.loss.L1Loss()
+        dy_result = l1_loss(input, label)
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [1])
+
+        l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
+        dy_result = l1_loss(input, label)
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
-    def test_L1Loss_none(self):
-        input_np = np.random.random(size=(10, 5)).astype(np.float32)
-        label_np = np.random.random(size=(10, 5)).astype(np.float32)
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        with fluid.program_guard(prog, startup_prog):
-            input = fluid.layers.data(
-                name='input', shape=[10, 5], dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=[10, 5], dtype='float32')
-            l1_loss = paddle.nn.loss.L1Loss(reduction='none')
-            ret = l1_loss(input, label)
-
-            exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[ret])
-
-        with fluid.dygraph.guard():
-            l1_loss = paddle.nn.loss.L1Loss(reduction='none')
-            dy_ret = l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
-            dy_result = dy_ret.numpy()
-
-        expected = np.abs(input_np - label_np)
-        self.assertTrue(np.allclose(static_result, expected))
-        self.assertTrue(np.allclose(static_result, dy_result))
-        self.assertTrue(np.allclose(dy_result, expected))
-        self.assertTrue(dy_result.shape, input.shape)
+        l1_loss = paddle.nn.loss.L1Loss(reduction='none')
+        dy_result = l1_loss(input, label)
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected))
+        self.assertTrue(dy_result.shape, [10, 10, 5])
+
+    def run_static(self, use_gpu=False):
+        input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32')
+        label = paddle.data(name='label', shape=[10, 10, 5], dtype='float32')
+        l1_loss = paddle.nn.loss.L1Loss()
+        result0 = l1_loss(input, label)
+        l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
+        result1 = l1_loss(input, label)
+        l1_loss = paddle.nn.loss.L1Loss(reduction='none')
+        result2 = l1_loss(input, label)
+        l1_loss = paddle.nn.loss.L1Loss(name='aaa')
+        result3 = l1_loss(input, label)
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        static_result = exe.run(
+            feed={"input": self.input_np,
+                  "label": self.label_np},
+            fetch_list=[result0, result1, result2])
+
+        expected = np.mean(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[0], expected))
+        expected = np.sum(np.abs(self.input_np - self.label_np))
+        self.assertTrue(np.allclose(static_result[1], expected))
+        expected = np.abs(self.input_np - self.label_np)
+        self.assertTrue(np.allclose(static_result[2], expected))
+        self.assertTrue('aaa' in result3.name)
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+    # test case the raise message
+    def test_errors(self):
+        def test_value_error():
+            loss = paddle.nn.loss.L1Loss(reduction="reduce_mean")
+
+        self.assertRaises(ValueError, test_value_error)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index f1bf6395f15ce0d5ce49eff241a752e4847d9d17..98c907a551965331f79d1635362213b43d867002 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -3,7 +3,7 @@ set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
 launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-python ${launch_py} multi_process.py
+python ${launch_py} multi_process.py launch
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
 
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+file_0="multi_process_launch.check_0.log"
+file_1="multi_process_launch.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
     echo "train abort as planned"
 fi
 
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f324e4bd377c616fb14b2b6df2b936b04ed76ff5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest, _set_use_system_allocator
+from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+
+class TestDygraphLayerNormv2(unittest.TestCase):
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    ln = fluid.dygraph.LayerNorm(shape[1:])
+                    y = ln(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    ln = paddle.nn.LayerNorm(shape[1:])
+                    y = ln(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    ln = fluid.dygraph.LayerNorm(shape[1:])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ln(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    ln = paddle.nn.LayerNorm(shape[1:])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ln(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            self.assertTrue(np.allclose(y1, y2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index a1ead2aef63f7b186ed2d5e8a6598349ae50509d..1992a3bb39807a62966e245d24888cc074746e8d 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -180,6 +180,51 @@ class TestLayer(LayerTest):
 
             self.assertRaises(TypeError, test_type)
 
+    def test_Flatten(self):
+        inp = np.ones([3, 4, 4, 5], dtype='float32')
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 4, 4, 5],
+                dtype='float32',
+                append_batch_size=False)
+            flatten = nn.Flatten()
+            ret = flatten(t)
+            static_ret = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret])[0]
+        with self.dynamic_graph():
+            t = base.to_variable(inp)
+            flatten = nn.Flatten()
+            dy_ret = flatten(t)
+            dy_ret_value = dy_ret.numpy()
+
+        self.assertTrue(np.array_equal(static_ret, dy_ret_value))
+
+        with self.static_graph():
+
+            # the input of Linear must be Variable.
+            def test_Variable():
+                inp = np.ones([3, 32, 32], dtype='float32')
+                linear = nn.Linear(
+                    32,
+                    4,
+                    bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                linear_ret1 = linear(inp)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            # the input dtype of Linear must be float16 or float32 or float64
+            # float16 only can be set on GPU place
+            def test_type():
+                inp = np.ones([3, 32, 32], dtype='int32')
+                linear = nn.Linear(
+                    32,
+                    4,
+                    bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                linear_ret2 = linear(inp)
+
+            self.assertRaises(TypeError, test_type)
+
     def test_layer_norm(self):
         inp = np.ones([3, 32, 32], dtype='float32')
         with self.static_graph():
@@ -238,6 +283,24 @@ class TestLayer(LayerTest):
             with self.assertRaises(ValueError):
                 lm(base.to_variable(inp))
 
+    def test_SyncBatchNorm(self):
+        if core.is_compiled_with_cuda():
+            with self.static_graph():
+                t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
+                my_sync_bn = paddle.nn.SyncBatchNorm(3)
+                ret = my_sync_bn(t)
+                static_ret = self.get_static_graph_result(
+                    feed={'t': np.ones(
+                        [3, 3, 5, 5], dtype='float32')},
+                    fetch_list=[ret])[0]
+
+            with self.dynamic_graph():
+                t = np.ones([3, 3, 5, 5], dtype='float32')
+                my_syncbn = paddle.nn.SyncBatchNorm(3)
+                dy_ret = my_syncbn(base.to_variable(t))
+                dy_ret_value = dy_ret.numpy()
+            self.assertTrue(np.array_equal(static_ret, static_ret))
+
     def test_relu(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
@@ -253,21 +316,6 @@ class TestLayer(LayerTest):
 
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
 
-    def test_leakyrelu(self):
-        inputs = np.random.uniform(-1, 1, (10, 10)).astype('float32')
-        with self.static_graph():
-            t = layers.data(name='t', shape=[10, 10], dtype='float32')
-            ret = layers.leaky_relu(t, alpha=0.01)
-            static_ret = self.get_static_graph_result(
-                feed={'t': inputs}, fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            lrelu = paddle.nn.LeakyReLU(alpha=0.01)
-            dy_ret = lrelu(base.to_variable(inputs))
-            dy_ret_value = dy_ret.numpy()
-
-        self.assertTrue(np.allclose(static_ret, dy_ret_value))
-
     def test_pad2d(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
@@ -2615,13 +2663,6 @@ class TestBook(LayerTest):
             out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu')
             return (out)
 
-    def make_leaky_relu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.leaky_relu(input, alpha=0.1, name='leaky_relu')
-            return (out)
-
     def make_soft_relu(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
@@ -3641,5 +3682,32 @@ class TestBook(LayerTest):
                         batch_first=batch_first)
 
 
+class TestMetricsDetectionMap(unittest.TestCase):
+    def test_detection_map(self):
+        program = fluid.Program()
+        with program_guard(program):
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 1],
+                append_batch_size=False,
+                dtype='float32')
+            box = fluid.layers.data(
+                name='bbox',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            map_eval = fluid.metrics.DetectionMAP(
+                detect_res, label, box, class_num=21)
+            cur_map, accm_map = map_eval.get_map_var()
+            self.assertIsNotNone(cur_map)
+            self.assertIsNotNone(accm_map)
+        print(str(program))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 71b452d4a2dd192c756599eb24949084bfa0860e..9a2e7b85e5202288b62a640e41e06f131b0cba84 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -19,6 +19,7 @@ import math
 import numpy as np
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
@@ -553,79 +554,459 @@ def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
 
 
 class TestReduceLROnPlateauDecay(unittest.TestCase):
-    def test_dygraph_mode(self):
-        with fluid.dygraph.guard():
-            # the decay rate must be less than 1.0
-            with self.assertRaises(ValueError):
-                fluid.dygraph.ReduceLROnPlateau(
-                    learning_rate=1.0, decay_rate=2.0)
-            # the mode must be "min" or "max"
-            with self.assertRaises(ValueError):
-                fluid.dygraph.ReduceLROnPlateau(learning_rate=1.0, mode="test")
-            # the threshold_mode must be "rel" or "abs"
-            with self.assertRaises(ValueError):
-                fluid.dygraph.ReduceLROnPlateau(
-                    learning_rate=1.0, threshold_mode="test")
-
-            base_lr = 1.0
-            patience = 3
-            cooldown = 1
-            decay_rate = 0.5
-            threshold = 1e-4
-            linear = fluid.dygraph.Linear(10, 10)
+    def test_ReduceLR(self):
+        # the decay rate must be less than 1.0
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0)
+        # the mode must be "min" or "max"
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test")
+        # the threshold_mode must be "rel" or "abs"
+        with self.assertRaises(ValueError):
+            paddle.optimizer.ReduceLROnPlateau(
+                learning_rate=1.0, threshold_mode="test")
+        with self.assertRaises(TypeError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate="test")
+        with self.assertRaises(TypeError):
+            paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test")
 
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+
+        for place in places:
             for m, n in zip(['min', 'max', 'min', 'max'],
                             ['rel', 'rel', 'abs', 'abs']):
                 kwargs = {
-                    'learning_rate': base_lr,
-                    'decay_rate': decay_rate,
-                    'threshold': threshold,
-                    'verbose': True,
-                    'patience': patience,
-                    'cooldown': cooldown,
+                    'learning_rate': 1.0,
                     'mode': m,
+                    'factor': 0.5,
+                    'patience': 3,
+                    'threshold': 1e-4,
                     'threshold_mode': n,
-                    'eps': 1e-6
+                    'cooldown': 1,
+                    'min_lr': 0,
+                    'epsilon': 1e-8,
+                    'verbose': False,
                 }
-                print("class=" + fluid.dygraph.ReduceLROnPlateau.__name__ +
-                      " kwargs=" + str(kwargs))
-                lr = fluid.dygraph.ReduceLROnPlateau(**kwargs)
-                sgd = fluid.optimizer.SGD(learning_rate=lr,
-                                          parameter_list=linear.parameters())
-
-                best = float("-10000") if m == "max" else float("10000")
-                expected_lr = 1.0
-                cooldown_counter = 0
-                num_bad_epochs = 0
-                var_list = [best, expected_lr, cooldown_counter, num_bad_epochs]
-                step_num = 0
-                epoch_num = 0
-                for epoch in range(30):
-                    total_loss = 0
-
-                    for batch_id in range(2):
-                        step_num += 1
-                        x = fluid.dygraph.to_variable(
-                            np.array([step_num]).astype('float32'))
-                        loss = layers.sin(x)
-                        sgd.minimize(loss)
-                        total_loss += loss
-
-                    epoch_num += 1
-                    # get expected lr from fluid
-                    avg_loss = total_loss / 1
-                    lr.step(avg_loss)
-                    actual_lr = lr().numpy()[0]
-
-                    # get expected lr form python
-                    expected_lr = reduce_lr_on_plateau(decay_rate, threshold,
-                                                       cooldown, patience, m, n,
-                                                       avg_loss, var_list)
-                    self.assertEqual(
-                        expected_lr,
-                        actual_lr,
-                        msg='Failed reduce lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
-                        format(epoch_num, expected_lr, actual_lr))
+                paddle.enable_static()
+                self._test_static(place, kwargs)
+                paddle.disable_static(place)
+                self._test_dygraph(place, kwargs)
+                paddle.enable_static()
+
+    def _test_static(self, place, kwargs):
+        paddle.enable_static()
+
+        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
+        current_lr = 1.0
+        cooldown_counter = 0
+        num_bad_epochs = 0
+        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
+
+        main_prog = fluid.Program()
+        start_prog = fluid.Program()
+        with fluid.program_guard(main_prog, start_prog):
+            x = fluid.layers.create_global_var(
+                [1], 1, 'float32', persistable=True)
+            paddle.increment(x)
+            loss = paddle.sin(x)
+            scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+            adam = fluid.optimizer.Adam(learning_rate=scheduler)
+            adam.minimize(loss)
+            lr_var = adam._global_learning_rate()
+            test_prog = main_prog.clone()
+
+        exe = fluid.Executor(place)
+        exe.run(start_prog)
+
+        for epoch in range(20):
+            for batch_id in range(1):
+                out, actual_lr = exe.run(main_prog,
+                                         fetch_list=[loss.name, lr_var.name])
+                expected_lr = reduce_lr_on_plateau(
+                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                    kwargs['patience'], kwargs['mode'],
+                    kwargs['threshold_mode'], out[0], var_list)
+
+            scheduler.step(out[0])
+            actual_lr = scheduler()
+            self.assertEqual(actual_lr, np.array(expected_lr))
+
+        for epoch in range(10):
+            for batch_id in range(1):
+                out, actual_lr = exe.run(test_prog,
+                                         fetch_list=[loss.name, lr_var.name])
+                expected_lr = reduce_lr_on_plateau(
+                    kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                    kwargs['patience'], kwargs['mode'],
+                    kwargs['threshold_mode'], out[0], var_list)
+            scheduler.step(out[0])
+            actual_lr = scheduler()
+            self.assertEqual(actual_lr, np.array(expected_lr))
+
+    def _test_dygraph(self, place, kwargs):
+        paddle.disable_static(place)
+
+        best = float("-10000") if kwargs['mode'] == "max" else float("10000")
+        current_lr = 1.0
+        cooldown_counter = 0
+        num_bad_epochs = 0
+        var_list = [best, current_lr, cooldown_counter, num_bad_epochs]
+
+        linear = paddle.nn.Linear(10, 10)
+        scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+        sgd = paddle.optimizer.SGD(learning_rate=scheduler,
+                                   parameter_list=linear.parameters())
+
+        for epoch in range(20):
+            for batch_id in range(1):
+                x = paddle.to_tensor(epoch).astype('float32')
+                loss = paddle.sin(x)
+                loss.backward()
+                sgd.minimize(loss)
+
+            scheduler.step(loss)
+            # get lr from paddle
+            current_lr = scheduler()
+            # get lr form python
+            expected_lr = reduce_lr_on_plateau(
+                kwargs['factor'], kwargs['threshold'], kwargs['cooldown'],
+                kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'],
+                loss, var_list)
+            self.assertEqual(current_lr, expected_lr)
+        state_dict = sgd.state_dict()
+        scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs)
+        sgd1 = paddle.optimizer.SGD(learning_rate=scheduler1,
+                                    parameter_list=linear.parameters())
+        sgd1.set_dict(state_dict)
+        self.assertEqual(scheduler.cooldown_counter,
+                         scheduler1.cooldown_counter)
+        self.assertEqual(scheduler.best.numpy()[0], scheduler1.best)
+        self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs)
+        self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
+        self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
+
+
+def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False):
+    if epoch_num == 0:
+        a = 1
+    else:
+        a = math.pow(epoch_num, -0.5)
+    b = math.pow(warmup_steps, -1.5) * epoch_num
+    return learning_rate * math.pow(d_model, -0.5) * min(a, b)
+
+
+def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False):
+    return learning_rate * lr_lambda(epoch_num)
+
+
+def piecewise_lr(epoch_num, boundaries, values, verbose=False):
+    assert len(boundaries) + 1 == len(values)
+    for i in range(len(boundaries)):
+        if epoch_num < boundaries[i]:
+            return values[i]
+    return values[len(values) - 1]
+
+
+def exponential_lr(epoch_num, learning_rate, gamma, verbose=False):
+    return learning_rate * gamma**epoch_num
+
+
+def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False):
+    return learning_rate * math.exp(-1 * gamma * epoch_num)
+
+
+def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False):
+    return learning_rate / (1 + gamma * epoch_num)
+
+
+def polynomial_lr(epoch_num,
+                  learning_rate,
+                  decay_steps,
+                  end_lr=0.0001,
+                  power=1.0,
+                  cycle=False,
+                  verbose=False):
+
+    if cycle:
+        div = math.ceil(epoch_num / float(decay_steps))
+        if epoch_num == 0:
+            div = 1
+        decay_steps = decay_steps * div
+    else:
+        epoch_num = min(epoch_num, decay_steps)
+    return (learning_rate - end_lr) * (
+        (1 - float(epoch_num) / float(decay_steps))**power) + end_lr
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr
+        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
+            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
+                math.pi / self.T_max)) / 2
+
+        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
+            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
+                self.last_lr - self.eta_min) + self.eta_min
+
+
+cosine_annealing_lr_current = None
+
+
+def cosine_annealing_lr(epoch_num,
+                        learning_rate,
+                        T_max,
+                        eta_min=0,
+                        verbose=False):
+    global cosine_annealing_lr_current
+    if epoch_num == 0:
+        cosine_annealing_lr_current = learning_rate
+    elif (epoch_num - 1 - T_max) % (2 * T_max) == 0:
+        cosine_annealing_lr_current = cosine_annealing_lr_current + (
+            learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max))
+                                        ) / 2
+    else:
+        cosine_annealing_lr_current = (1 + math.cos(
+            math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * (
+                epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current -
+                                                   eta_min) + eta_min
+    return cosine_annealing_lr_current
+
+
+def linear_warmup_lr(epoch_num,
+                     learning_rate,
+                     warmup_steps,
+                     start_lr,
+                     end_lr,
+                     verbose=False):
+    if epoch_num < warmup_steps:
+        return start_lr + (end_lr - start_lr) * (float(epoch_num) /
+                                                 float(warmup_steps))
+    else:
+        return learning_rate
+
+
+def multi_step_lr(epoch_num,
+                  learning_rate,
+                  milestones,
+                  gamma=0.1,
+                  verbose=False):
+    for i in range(len(milestones)):
+        if epoch_num < milestones[i]:
+            return learning_rate * (gamma**i)
+    return learning_rate * (gamma**len(milestones))
+
+
+def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False):
+    return learning_rate * math.pow(gamma, epoch_num // step_size)
+
+
+class TestLRScheduler(unittest.TestCase):
+    def _test_static(self, python_func, paddle_api, kwarg, place):
+        main_prog = fluid.Program()
+        start_prog = fluid.Program()
+        with fluid.program_guard(main_prog, start_prog):
+            x = fluid.data(name='x', shape=[3, 4, 5])
+            y = fluid.data(name='y', shape=[3, 4, 5])
+            z = fluid.layers.fc(x, 100)
+            loss = fluid.layers.mean(z)
+            scheduler = paddle_api(**kwarg)
+            adam = fluid.optimizer.Adam(learning_rate=scheduler)
+            adam.minimize(loss)
+            lr_var = adam._global_learning_rate()
+            test_prog = main_prog.clone()
+
+        num = 0
+        exe = fluid.Executor(place)
+        exe.run(start_prog)
+        for epoch in range(5):
+            for batch_id in range(2):
+                out = exe.run(
+                    main_prog,
+                    feed={
+                        'x': np.random.randn(3, 4, 5).astype('float32'),
+                        'y': np.random.randn(3, 4, 5).astype('float32')
+                    },
+                    fetch_list=lr_var.name)
+            self.assertEqual(out, np.array(python_func(num, **kwarg)))
+            scheduler.step()
+            num += 1
+
+        for epoch in range(5):
+            for batch_id in range(2):
+                out = exe.run(
+                    test_prog,
+                    feed={
+                        'x': np.random.randn(3, 4, 5).astype('float32'),
+                        'y': np.random.randn(3, 4, 5).astype('float32')
+                    },
+                    fetch_list=lr_var.name)
+            self.assertEqual(out, np.array(python_func(num, **kwarg)))
+            scheduler.step()
+            num += 1
+
+        if isinstance(place, fluid.CPUPlace):
+            compiled_train_prog = fluid.CompiledProgram(
+                main_prog).with_data_parallel(
+                    loss_name=loss.name, places=fluid.cpu_places(4))
+            for epoch in range(5):
+                python_result = python_func(num, **kwarg)
+                for batch_id in range(2):
+                    _ = exe.run(
+                        compiled_train_prog,
+                        feed={
+                            'x': np.random.randn(12, 4, 5).astype('float32'),
+                            'y': np.random.randn(12, 4, 5).astype('float32')
+                        },
+                        fetch_list=lr_var.name)
+                scopes = compiled_train_prog._executor.local_scopes()
+                out = np.array(scopes[0].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[1].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[2].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[3].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                scheduler.step()
+                num += 1
+
+            compiled_test_prog = fluid.CompiledProgram(
+                test_prog).with_data_parallel(
+                    loss_name=loss.name,
+                    share_vars_from=compiled_train_prog,
+                    places=fluid.cpu_places(4))
+            for epoch in range(5):
+                python_result = python_func(num, **kwarg)
+                for batch_id in range(2):
+                    _ = exe.run(
+                        compiled_test_prog,
+                        feed={
+                            'x': np.random.randn(12, 4, 5).astype('float32'),
+                            'y': np.random.randn(12, 4, 5).astype('float32')
+                        },
+                        fetch_list=lr_var.name)
+                scopes = compiled_test_prog._executor.local_scopes()
+                out = np.array(scopes[0].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[1].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[2].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                out = np.array(scopes[3].var(lr_var.name).get_tensor())
+                self.assertEqual(out, np.array(python_result))
+                scheduler.step()
+                num += 1
+
+    def _test_dygraph(self, python_func, paddle_api, kwarg, place):
+        x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+        linear = paddle.nn.Linear(10, 10)
+        scheduler = paddle_api(**kwarg)
+        sgd = paddle.optimizer.SGD(learning_rate=scheduler,
+                                   parameter_list=linear.parameters())
+        for epoch in range(20):
+            for batch_id in range(2):
+                x = paddle.to_tensor(x)
+                out = linear(x)
+                loss = paddle.reduce_mean(out)
+                out.backward()
+                sgd.minimize(loss)
+                linear.clear_gradients()
+
+            self.assertAlmostEqual(sgd.current_step_lr(),
+                                   python_func(epoch, **kwarg))
+            if paddle_api.__name__ != "CosineAnnealingLR":
+                scheduler.step()
+            else:
+                scheduler.step(epoch + 1)
+
+    def test_scheduler(self):
+        with self.assertRaises(NotImplementedError):
+            paddle.optimizer.lr_scheduler._LRScheduler().step()
+        with self.assertRaises(TypeError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate="test", milestones=[1, 2, 3])
+        with self.assertRaises(TypeError):
+            paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test')
+        with self.assertRaises(ValueError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate=0.5, milestones=[3, 2, 1])
+        with self.assertRaises(ValueError):
+            paddle.optimizer.MultiStepLR(
+                learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
+
+        func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, {
+            "d_model": 0.01,
+            "warmup_steps": 100,
+            "verbose": False
+        }), (piecewise_lr, paddle.optimizer.PiecewiseLR, {
+            "boundaries": [3, 6, 9, 15, 20],
+            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "verbose": False
+        }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.1,
+            "verbose": False
+        }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.1,
+            "verbose": True
+        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
+            "learning_rate": 0.5,
+            "decay_steps": 20,
+            "end_lr": 0,
+            "power": 1.0,
+            "cycle": False,
+            "verbose": False
+        }), (polynomial_lr, paddle.optimizer.PolynomialLR, {
+            "learning_rate": 0.5,
+            "decay_steps": 20,
+            "end_lr": 0,
+            "power": 1.0,
+            "cycle": True,
+            "verbose": False
+        }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, {
+            'learning_rate': 0.5,
+            'warmup_steps': 20,
+            'start_lr': 0,
+            'end_lr': 0.5,
+            "verbose": False
+        }), (exponential_lr, paddle.optimizer.ExponentialLR, {
+            "learning_rate": 0.5,
+            "gamma": 0.9,
+            "verbose": False
+        }), (multi_step_lr, paddle.optimizer.MultiStepLR, {
+            "learning_rate": 0.5,
+            "milestones": [3, 6, 9, 15, 20],
+            "gamma": 0.8,
+            "verbose": True
+        }), (step_lr, paddle.optimizer.StepLR, {
+            "learning_rate": 0.5,
+            "step_size": 2,
+            "gamma": 0.8,
+            "verbose": False
+        }), (lambda_lr, paddle.optimizer.LambdaLR, {
+            "learning_rate": 0.5,
+            "lr_lambda": lambda x: 0.95**x,
+            "verbose": False
+        }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, {
+            "learning_rate": 0.5,
+            "T_max": 10,
+            "verbose": True
+        })]
+
+        for python_func, paddle_api, kwarg in func_api_kwargs:
+            places = [fluid.CPUPlace()]
+            if core.is_compiled_with_cuda():
+                places.append(fluid.CUDAPlace(0))
+
+            for place in places:
+                paddle.enable_static()
+                self._test_static(python_func, paddle_api, kwarg, place)
+                paddle.disable_static(place)
+                self._test_dygraph(python_func, paddle_api, kwarg, place)
+                paddle.enable_static()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py
index c7bab1a135bc439eefa822087869e08a43de0c51..6d1f42111eebff0f469317ddf2a9ec7698a7ae1e 100644
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
@@ -32,6 +32,7 @@ class TestLinspaceOpCommonCase(OpTest):
             'Stop': np.array([10]).astype(dtype),
             'Num': np.array([11]).astype('int32')
         }
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
 
         self.outputs = {'Out': np.arange(0, 11).astype(dtype)}
 
@@ -48,6 +49,7 @@ class TestLinspaceOpReverseCase(OpTest):
             'Stop': np.array([0]).astype(dtype),
             'Num': np.array([11]).astype('int32')
         }
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
 
         self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)}
 
@@ -64,6 +66,7 @@ class TestLinspaceOpNumOneCase(OpTest):
             'Stop': np.array([0]).astype(dtype),
             'Num': np.array([1]).astype('int32')
         }
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
 
         self.outputs = {'Out': np.array(10, dtype=dtype)}
 
@@ -72,6 +75,26 @@ class TestLinspaceOpNumOneCase(OpTest):
 
 
 class TestLinspaceAPI(unittest.TestCase):
+    def test_variable_input1(self):
+        start = paddle.full(shape=[1], fill_value=0, dtype='float32')
+        stop = paddle.full(shape=[1], fill_value=10, dtype='float32')
+        num = paddle.full(shape=[1], fill_value=5, dtype='int32')
+        out = paddle.linspace(start, stop, num, dtype='float32')
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res = exe.run(fluid.default_main_program(), fetch_list=[out])
+        np_res = np.linspace(0, 10, 5, dtype='float32')
+        self.assertEqual((res == np_res).all(), True)
+
+    def test_variable_input2(self):
+        paddle.disable_static()
+        start = paddle.full(shape=[1], fill_value=0, dtype='float32')
+        stop = paddle.full(shape=[1], fill_value=10, dtype='float32')
+        num = paddle.full(shape=[1], fill_value=5, dtype='int32')
+        out = paddle.linspace(start, stop, num, dtype='float32')
+        np_res = np.linspace(0, 10, 5, dtype='float32')
+        self.assertEqual((out.numpy() == np_res).all(), True)
+        paddle.enable_static()
+
     def test_dtype(self):
         out_1 = paddle.linspace(0, 10, 5, dtype='float32')
         out_2 = paddle.linspace(0, 10, 5, dtype=np.float32)
@@ -82,16 +105,23 @@ class TestLinspaceAPI(unittest.TestCase):
         assert np.array_equal(res_1, res_2)
 
     def test_name(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             out = paddle.linspace(
                 0, 10, 5, dtype='float32', name='linspace_res')
             assert 'linspace_res' in out.name
 
     def test_imperative(self):
-        with paddle.imperative.guard():
-            out = paddle.linspace(0, 10, 5, dtype='float32')
-            np_out = np.linspace(0, 10, 5, dtype='float32')
-        self.assertEqual((out.numpy() == np_out).all(), True)
+        paddle.disable_static()
+        out1 = paddle.linspace(0, 10, 5, dtype='float32')
+        np_out1 = np.linspace(0, 10, 5, dtype='float32')
+        out2 = paddle.linspace(0, 10, 5, dtype='int32')
+        np_out2 = np.linspace(0, 10, 5, dtype='int32')
+        out3 = paddle.linspace(0, 10, 200, dtype='int32')
+        np_out3 = np.linspace(0, 10, 200, dtype='int32')
+        paddle.enable_static()
+        self.assertEqual((out1.numpy() == np_out1).all(), True)
+        self.assertEqual((out2.numpy() == np_out2).all(), True)
+        self.assertEqual((out3.numpy() == np_out3).all(), True)
 
 
 class TestLinspaceOpError(unittest.TestCase):
@@ -99,7 +129,12 @@ class TestLinspaceOpError(unittest.TestCase):
         with program_guard(Program(), Program()):
 
             def test_dtype():
-                fluid.layers.linspace(0, 10, 1, dtype="int32")
+                fluid.layers.linspace(0, 10, 1, dtype="int8")
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_dtype():
+                fluid.layers.linspace(0, 10, 1.33, dtype="int32")
 
             self.assertRaises(TypeError, test_dtype)
 
@@ -119,20 +154,20 @@ class TestLinspaceOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_step_dtype)
 
             def test_start_dtype():
-                start = fluid.data(shape=[1], type="int32", name="start")
+                start = fluid.data(shape=[1], dtype="int32", name="start")
                 fluid.layers.linspace(start, 10, 1, dtype="float32")
 
             self.assertRaises(TypeError, test_start_dtype)
 
             def test_end_dtype():
-                end = fluid.data(shape=[1], type="int32", name="end")
+                end = fluid.data(shape=[1], dtype="int32", name="end")
                 fluid.layers.linspace(0, end, 1, dtype="float32")
 
             self.assertRaises(TypeError, test_end_dtype)
 
-            def test_step_dtype():
-                step = fluid.data(shape=[1], type="int32", name="step")
-                fluid.layers.linspace(0, 10, step, dtype="float32")
+            def test_num_dtype():
+                num = fluid.data(shape=[1], dtype="int32", name="step")
+                fluid.layers.linspace(0, 10, num, dtype="float32")
 
             self.assertRaises(TypeError, test_step_dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py
index 2b77624734d335bd999754b378971bcc5c945fa5..e3d7003ecedb60f9b4f9a542ed08ca88d894d24a 100644
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -14,93 +14,136 @@
 
 import unittest
 import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.nn as nn
-import paddle.nn.functional as functional
+from op_test import OpTest
+import paddle
+import paddle.nn.functional as F
 
+np.random.seed(10)
 
-def stable_softmax(x):
+
+def ref_log_softmax(x):
     shiftx = (x - np.max(x))
-    exps = np.exp(shiftx)
-    return exps / np.sum(exps)
+    out = shiftx - np.log(np.exp(shiftx).sum())
+    return out
 
 
-def ref_log_softmax(x, axis=None, dtype=None):
-    x_t = x.copy()
-    if dtype is not None:
-        x_t = x_t.astype(dtype)
-    if axis is None:
-        axis = -1
-    out = np.apply_along_axis(stable_softmax, axis, x_t)
-    return np.log(out)
+def ref_log_softmax_grad(x, axis):
+    if axis < 0:
+        axis += len(x.shape)
+    out = np.apply_along_axis(ref_log_softmax, axis, x)
+    axis_dim = x.shape[axis]
+    dout = np.full_like(x, fill_value=1. / x.size)
+    dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat(
+        axis_dim, axis=axis)
+    return dx
 
 
-class TestNNLogSoftmaxAPI(unittest.TestCase):
+class TestLogSoftmaxOp(OpTest):
     def setUp(self):
-        self.init_data()
+        self.op_type = 'log_softmax'
+        self.dtype = 'float64'
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        self.set_attrs()
 
-    def init_data(self):
-        self.x_shape = [2, 3, 4, 5]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype)
+        out = np.apply_along_axis(ref_log_softmax, self.axis, x)
+        self.x_grad = ref_log_softmax_grad(x, self.axis)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {'axis': self.axis}
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['Out'], user_defined_grads=[self.x_grad])
+
+
+class TestLogSoftmaxShape(TestLogSoftmaxOp):
+    def set_attrs(self):
+        self.shape = [12, 10]
 
-    def check_api(self, place=fluid.CPUPlace(), axis=None):
-        ref_out = ref_log_softmax(self.x, axis)
 
-        main_program = fluid.Program()
-        mylogsoftmax = nn.LogSoftmax(axis)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = mylogsoftmax(x)
-        exe = fluid.Executor(place)
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+class TestLogSoftmaxAxis(TestLogSoftmaxOp):
+    def set_attrs(self):
+        self.axis = 1
+
+
+class TestNNLogSoftmaxAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_shape = [2, 3, 4, 5]
+        self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32)
+        self.place = paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def check_api(self, axis=-1):
+        ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x)
+
+        logsoftmax = paddle.nn.LogSoftmax(axis)
+        # test static api
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data(name='x', shape=self.x_shape)
+            y = logsoftmax(x)
+            exe = paddle.static.Executor(self.place)
+            out = exe.run(feed={'x': self.x}, fetch_list=[y])
         self.assertTrue(np.allclose(out[0], ref_out))
 
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = mylogsoftmax(x)
+        # test dygrapg api
+        paddle.disable_static()
+        x = paddle.to_variable(self.x)
+        y = logsoftmax(x)
         self.assertTrue(np.allclose(y.numpy(), ref_out))
+        paddle.enable_static()
 
     def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            for axis in [None, 2]:
-                self.check_api(place, axis)
+        for axis in [-1, 1]:
+            self.check_api(axis)
 
 
 class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
     def setUp(self):
-        self.init_data()
-
-    def init_data(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-
-    def check_api(self, place=fluid.CPUPlace(), axis=None, dtype=None):
-        ref_out = ref_log_softmax(self.x, axis, dtype)
-        main_program = fluid.Program()
-        mylogsoftmax = nn.LogSoftmax(axis)
-        with fluid.program_guard(main_program):
-            x = fluid.data(name='x', shape=self.x_shape)
-            y = functional.log_softmax(x, axis, dtype)
-        exe = fluid.Executor(place)
-        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+        self.place = paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def check_api(self, axis=-1, dtype=None):
+        x = self.x.copy()
+        if dtype is not None:
+            x = x.astype(dtype)
+        ref_out = np.apply_along_axis(ref_log_softmax, axis, x)
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data(name='x', shape=self.x_shape)
+            y = F.log_softmax(x, axis, dtype)
+            exe = paddle.static.Executor(self.place)
+            out = exe.run(feed={'x': self.x}, fetch_list=[y])
         self.assertTrue(np.allclose(out[0], ref_out))
 
-        with fluid.dygraph.guard(place):
-            x = fluid.dygraph.to_variable(self.x)
-            y = functional.log_softmax(x, axis, dtype)
-        self.assertTrue(np.allclose(y.numpy(), ref_out))
+        paddle.disable_static()
+        x = paddle.to_variable(self.x)
+        y = F.log_softmax(x, axis, dtype)
+        self.assertTrue(np.allclose(y.numpy(), ref_out), True)
+        paddle.enable_static()
 
     def test_check_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self.check_api(place, None, None)
-            self.check_api(place, None, np.float64)
+        for axis in [-1, 1]:
+            self.check_api(axis)
+        self.check_api(-1, 'float64')
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data(name='X1', shape=[100], dtype='int32')
+            self.assertRaises(TypeError, F.log_softmax, x)
+
+            x = paddle.data(name='X2', shape=[100], dtype='float32')
+            self.assertRaises(TypeError, F.log_softmax, x, dtype='int32')
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py
old mode 100644
new mode 100755
index 8f0049a8d30d0e1fed1d27cf6e13c036e33678d0..b26b6ab6c3ce7cc68ad877b183eb8733293b9228
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logical_op.py
@@ -17,8 +17,9 @@ from __future__ import print_function
 import op_test
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+from paddle.static import Program, program_guard
 
 
 def create_test_class(op_type, callback, binary_op=True):
@@ -42,6 +43,8 @@ def create_test_class(op_type, callback, binary_op=True):
 
         def test_error(self):
             with program_guard(Program(), Program()):
+
+                # test 1 type error, x, y must be bool type
                 x = fluid.layers.data(name='x', shape=[2], dtype='bool')
                 y = fluid.layers.data(name='y', shape=[2], dtype='bool')
                 a = fluid.layers.data(name='a', shape=[2], dtype='int32')
@@ -54,7 +57,16 @@ def create_test_class(op_type, callback, binary_op=True):
                     self.assertRaises(TypeError, op, x=x, out=1)
                     self.assertRaises(TypeError, op, x=a)
 
-    Cls.__name__ = op_type
+                # test 2 type error, x, y must be same shape
+                x_data = fluid.layers.data(
+                    name='x_data', shape=[2], dtype='bool')
+                y_data = fluid.layers.data(
+                    name='y_data', shape=[2, 2], dtype='bool')
+
+                if self.op_type != "logical_not":
+                    self.assertRaises(TypeError, op, x=x_data, y=y_data, out=1)
+                    self.assertRaises(TypeError, op, x=y_data, y=x_data)
+
     globals()[op_type] = Cls
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py
index 508b4a7b72da8affbc7ddf590b8142a41d1f3191..c2201a52605bc87246fb9c8734494b19f83ff180 100644
--- a/python/paddle/fluid/tests/unittests/test_logsumexp.py
+++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py
@@ -12,64 +12,128 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
 import paddle
-import paddle.fluid as fluid
 import unittest
 import numpy as np
 from op_test import OpTest
-from paddle.fluid import Program, program_guard
-from paddle.fluid.layer_helper import LayerHelper
 
 
-class TestLogSumOpError(unittest.TestCase):
+def ref_logsumexp(x, axis=None, keepdim=False, reduce_all=False):
+    if isinstance(axis, int):
+        axis = (axis, )
+    elif isinstance(axis, list):
+        axis = tuple(axis)
+    if reduce_all:
+        axis = None
+    out = np.log(np.exp(x).sum(axis=axis, keepdims=keepdim))
+    return out
+
+
+class TestLogsumexp(OpTest):
+    def setUp(self):
+        self.op_type = 'logsumexp'
+        self.shape = [2, 3, 4, 5]
+        self.dtype = 'float64'
+        self.axis = [-1]
+        self.keepdim = False
+        self.reduce_all = False
+        self.set_attrs()
+
+        np.random.seed(10)
+        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out = ref_logsumexp(x, self.axis, self.keepdim, self.reduce_all)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keepdim,
+            'reduce_all': self.reduce_all
+        }
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['Out'])
+
+
+class TestLogsumexp_shape(TestLogsumexp):
+    def set_attrs(self):
+        self.shape = [4, 5, 6]
+
+
+class TestLogsumexp_axis(TestLogsumexp):
+    def set_attrs(self):
+        self.axis = [0, -1]
+
+
+class TestLogsumexp_axis_all(TestLogsumexp):
+    def set_attrs(self):
+        self.axis = [0, 1, 2, 3]
+
+
+class TestLogsumexp_keepdim(TestLogsumexp):
+    def set_attrs(self):
+        self.keepdim = True
+
+
+class TestLogsumexp_reduce_all(TestLogsumexp):
+    def set_attrs(self):
+        self.reduce_all = True
+
+
+class TestLogsumexpError(unittest.TestCase):
     def test_errors(self):
-        with program_guard(Program(), Program()):
-
-            x1 = fluid.layers.data(name='x1', shape=[120], dtype="uint8")
-            self.assertRaises(Exception, paddle.logsumexp, x1)
-
-            x2 = fluid.layers.data(name='x2', shape=[2, 3], dtype="int")
-            self.assertRaises(Exception, paddle.logsumexp, x2)
-
-            x3 = fluid.layers.data(name='x3', shape=[3], dtype="float16")
-            self.assertRaises(Exception, paddle.logsumexp, x3)
-
-            self.assertRaises(AssertionError, paddle.logsumexp, None)
-
-
-class TestLogSumExpOp(unittest.TestCase):
-    def test_dygraph(self):
-        with fluid.dygraph.guard():
-            np_x = np.random.uniform(0.1, 1, [123]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(x).numpy(), np.log(np.sum(np.exp(np_x)))))
-
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(
-                        x, dim=[1, 2]).numpy(),
-                    np.log(np.sum(np.exp(np_x), axis=(1, 2)))))
-
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(
-                        x, dim=[2]).numpy(),
-                    np.log(np.sum(np.exp(np_x), axis=(2)))))
-
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            self.assertTrue(
-                np.allclose(
-                    paddle.logsumexp(
-                        x, keepdim=True).numpy(),
-                    np.log(np.sum(np.exp(np_x), keepdims=True))))
+        with paddle.static.program_guard(paddle.static.Program()):
+            self.assertRaises(TypeError, paddle.logsumexp, 1)
+            x1 = paddle.data(name='x1', shape=[120], dtype="int32")
+            self.assertRaises(TypeError, paddle.logsumexp, x1)
+
+
+class TestLogsumexpAPI(unittest.TestCase):
+    def setUp(self):
+        self.shape = [2, 3, 4, 5]
+        self.x = np.random.uniform(-1, 1, self.shape).astype(np.float32)
+        self.place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def api_case(self, axis=None, keepdim=False):
+        out_ref = ref_logsumexp(self.x, axis, keepdim)
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.shape)
+            out = paddle.logsumexp(x, axis, keepdim)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x}, fetch_list=[out])
+        self.assertTrue(np.allclose(res[0], out_ref))
+
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x)
+        out = paddle.logsumexp(x, axis, keepdim)
+        self.assertTrue(np.allclose(out.numpy(), out_ref))
+        paddle.enable_static()
+
+    def test_api(self):
+        self.api_case()
+        self.api_case(2)
+        self.api_case([-1])
+        self.api_case([2, -3])
+        self.api_case((0, 1, -1))
+        self.api_case(keepdim=True)
+
+    def test_alias(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_variable(self.x)
+        out1 = paddle.logsumexp(x)
+        out2 = paddle.tensor.logsumexp(x)
+        out3 = paddle.tensor.math.logsumexp(x)
+        out_ref = ref_logsumexp(self.x)
+        for out in [out1, out2, out3]:
+            self.assertTrue(np.allclose(out.numpy(), out_ref))
+        paddle.enable_static()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index d4189eca0369702120f079dae9067a58da1e9597..90430bbce4d1896c8fdbb829230f2ad8a691adff 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -20,15 +20,14 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle.fluid.layers as layers
 
 SIGMOID_THRESHOLD_MIN = -40.0
 SIGMOID_THRESHOLD_MAX = 13.0
 EXP_MAX_INPUT = 40.0
 
 
-def lstm_naive(
-        input,
-        w, ):
+def lstm_naive(input, w):
     seq_len, batch_size, hidden_size = input.shape
 
     offset = 0
@@ -86,8 +85,8 @@ def lstm_naive(
         return (2. / (1. + np.exp(y))) - 1.
 
     output = []
-    pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
-    pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype)
+    pre_h = np.zeros((1, batch_size, hidden_size), dtype=input.dtype)
+    pre_c = np.zeros((1, batch_size, hidden_size), dtype=input.dtype)
 
     for i in range(seq_len):
         emb_1 = input[i]
@@ -110,7 +109,6 @@ def lstm_naive(
 
     output = np.concatenate(output, -1)
     output = output.reshape((batch_size, -1, hidden_size))
-
     output = output.transpose((1, 0, 2))
 
     return output, pre_h, pre_c
@@ -119,11 +117,12 @@ def lstm_naive(
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNLstmOp(OpTest):
+    # TODO(GaoWei8):when input dtype is fp64, precision threshold should be removed.
     def setUp(self):
         self.op_type = "cudnn_lstm"
-        self.dtype = np.float32
+        self.dtype = np.float64
 
-        num_steps = 20
+        seq_length = 20
         batch_size = 5
         hidden_size = 20
 
@@ -133,33 +132,24 @@ class TestCUDNNLstmOp(OpTest):
         weight_size += hidden_size * 8
 
         input = np.random.uniform(
-            low=-0.1, high=0.1, size=(num_steps, batch_size,
+            low=-0.1, high=0.1, size=(seq_length, batch_size,
                                       hidden_size)).astype(self.dtype)
         flat_w = np.random.uniform(
             low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype)
 
         output, last_hidden, last_cell = lstm_naive(input, flat_w)
 
-        init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
-        init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
-        scope = core.Scope()
-        program = fluid.Program()
-        block = program.global_block()
-
-        cache_temp = block.create_var(
-            name="Cache",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW,
-            stop_gradient=True)
+        init_h = np.zeros((1, batch_size, hidden_size), dtype=np.float64)
+        init_c = np.zeros((1, batch_size, hidden_size), dtype=np.float64)
+        state_out = np.ndarray((300)).astype("uint8")
+
         self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
-            'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
-            'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
+            'Input': input,
+            'W': flat_w,
+            'InitH': init_h,
+            'InitC': init_c
         }
-        self.cache_name_list = ['Cache']
         self.attrs = {
-            'max_len': num_steps,
             'dropout_prob': 0.0,
             'is_bidirec': False,
             'input_size': hidden_size,
@@ -168,22 +158,61 @@ class TestCUDNNLstmOp(OpTest):
         }
         self.outputs = {
             'Out': output,
-            "last_h": last_hidden,
-            'last_c': last_cell
+            "LastH": last_hidden,
+            'LastC': last_cell,
+            'Reserve': np.ndarray((400)).astype("uint8"),
+            'StateOut': state_out
         }
 
     def test_output_with_place(self):
         # depend on the scope structure
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-5, check_dygraph=False)
+        self.check_output_with_place(
+            place, no_check_set=['Reserve', 'StateOut'])
 
     def test_grad_with_place(self):
         # depend on the scope structure
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
             place,
-            set(['Input', 'W', 'InitH', 'InitC']), ['Out', 'last_h', 'last_c'],
-            check_dygraph=False)
+            set(['Input', 'W', 'InitH', 'InitC']), ['Out', 'LastH', 'LastC'],
+            max_relative_error=1e-4)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestCUDNNlstmAPI(unittest.TestCase):
+    def test_lstm(self):
+        seq_len = 20
+        batch_size = 5
+        hidden_size = 20
+        dropout_prob = 0.0
+        num_layers = 1
+        input = fluid.data(
+            name='input',
+            shape=[seq_len, batch_size, hidden_size],
+            dtype='float64')
+        init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
+                                      'float64', 0.0)
+        init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
+                                      'float64', 0.0)
+        rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len,
+                                              hidden_size, num_layers,
+                                              dropout_prob)
+        exe = fluid.Executor(fluid.CUDAPlace(0))
+        exe.run(fluid.default_startup_program())
+        input_i = np.random.uniform(
+            low=-0.1, high=0.1, size=(seq_len, batch_size,
+                                      hidden_size)).astype("float64")
+        out = exe.run(fluid.default_main_program(),
+                      feed={'input': input_i},
+                      fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0'])
+
+        output, last_hidden, last_cell = lstm_naive(input_i, out[3])
+
+        self.assertTrue(np.allclose(output, out[0], atol=1e-5))
+        self.assertTrue(np.allclose(last_hidden, out[1], atol=1e-5))
+        self.assertTrue(np.allclose(last_cell, out[2], atol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_masked_select_op.py b/python/paddle/fluid/tests/unittests/test_masked_select_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..259a36e30d9a9c1852ff3800d5240ce7e7bb0e26
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_masked_select_op.py
@@ -0,0 +1,124 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle
+
+
+def np_masked_select(x, mask):
+    result = np.empty(shape=(0), dtype=x.dtype)
+    for ele, ma in zip(np.nditer(x), np.nditer(mask)):
+        if ma:
+            result = np.append(result, ele)
+    return result.flatten()
+
+
+class TestMaskedSelectOp(OpTest):
+    def setUp(self):
+        self.init()
+        self.op_type = "masked_select"
+        x = np.random.random(self.shape).astype("float64")
+        mask = np.array(np.random.randint(2, size=self.shape, dtype=bool))
+        out = np_masked_select(x, mask)
+        self.inputs = {'X': x, 'Mask': mask}
+        self.outputs = {'Y': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y')
+
+    def init(self):
+        self.shape = (50, 3)
+
+
+class TestMaskedSelectOp1(TestMaskedSelectOp):
+    def init(self):
+        self.shape = (6, 8, 9, 18)
+
+
+class TestMaskedSelectOp2(TestMaskedSelectOp):
+    def init(self):
+        self.shape = (168, )
+
+
+class TestMaskedSelectAPI(unittest.TestCase):
+    def test_imperative_mode(self):
+        paddle.disable_static()
+        shape = (88, 6, 8)
+        np_x = np.random.random(shape).astype('float32')
+        np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
+        x = paddle.to_tensor(np_x)
+        mask = paddle.to_tensor(np_mask)
+        out = paddle.masked_select(x, mask)
+        np_out = np_masked_select(np_x, np_mask)
+        self.assertEqual(np.allclose(out.numpy(), np_out), True)
+        paddle.enable_static()
+
+    def test_static_mode(self):
+        shape = [8, 9, 6]
+        x = paddle.data(shape=shape, dtype='float32', name='x')
+        mask = paddle.data(shape=shape, dtype='bool', name='mask')
+        np_x = np.random.random(shape).astype('float32')
+        np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
+
+        out = paddle.masked_select(x, mask)
+        np_out = np_masked_select(np_x, np_mask)
+
+        exe = paddle.static.Executor(place=paddle.CPUPlace())
+
+        res = exe.run(paddle.static.default_main_program(),
+                      feed={"x": np_x,
+                            "mask": np_mask},
+                      fetch_list=[out])
+        self.assertEqual(np.allclose(res, np_out), True)
+
+
+class TestMaskedSelectError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+
+            shape = [8, 9, 6]
+            x = paddle.data(shape=shape, dtype='float32', name='x')
+            mask = paddle.data(shape=shape, dtype='bool', name='mask')
+            mask_float = paddle.data(
+                shape=shape, dtype='float32', name='mask_float')
+            np_x = np.random.random(shape).astype('float32')
+            np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
+
+            def test_x_type():
+                paddle.masked_select(np_x, mask)
+
+            self.assertRaises(TypeError, test_x_type)
+
+            def test_mask_type():
+                paddle.masked_select(x, np_mask)
+
+            self.assertRaises(TypeError, test_mask_type)
+
+            def test_mask_dtype():
+                paddle.masked_select(x, mask_float)
+
+            self.assertRaises(TypeError, test_mask_dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index f6eff22d6ce5f06d8853d6244f79b4b07b3fa4f5..00137f63e244a0e166047e89f9ef436da158ed16 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase):
     @prog_scope()
     def test_integer_div(self):
         a = fluid.layers.data(name="a", shape=[1], dtype='int64')
-        b = a / 7
+        b = a / 2
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64')
+        a_np = numpy.array([3, 4, 10, 14, 9, 18])
         b_np, = exe.run(fluid.default_main_program(),
                         feed={"a": a_np},
                         fetch_list=[b])
-
-        b_np_actual = (a_np / 7).astype('int64')
+        # for paddle2.0, use true_divide
+        b_np_actual = (a_np / 2.0)
         self.assertTrue(numpy.array_equal(b_np, b_np_actual))
 
     @prog_scope()
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index 803293be9b7d637875b56b443b04c246737ed2f8..9bb12d546550a821e8a133dd9c91d5d41a50b1b2 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import numpy as np
 import six
@@ -284,6 +285,223 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
             self.assertEqual((a != b).dtype, fluid.core.VarDesc.VarType.BOOL)
             self.assertTrue(np.array_equal((a != b).numpy(), a_np != b_np))
 
+    def test_tensor_patch_method(self):
+        paddle.disable_static()
+        x_np = np.random.uniform(-1, 1, [2, 3]).astype(self.dtype)
+        y_np = np.random.uniform(-1, 1, [2, 3]).astype(self.dtype)
+        z_np = np.random.uniform(-1, 1, [6, 9]).astype(self.dtype)
+
+        x = paddle.to_tensor(x_np)
+        y = paddle.to_tensor(y_np)
+        z = paddle.to_tensor(z_np)
+
+        a = paddle.to_tensor([[1, 1], [2, 2], [3, 3]])
+        b = paddle.to_tensor([[1, 1], [2, 2], [3, 3]])
+
+        # 1. Unary operation for Tensor
+        self.assertEqual(x.dim(), 2)
+        self.assertEqual(x.ndimension(), 2)
+        self.assertEqual(x.ndim, 2)
+        self.assertEqual(x.size(), [2, 3])
+        self.assertTrue(
+            np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(x.logsigmoid().numpy(),
+                           fluid.layers.logsigmoid(x).numpy()))
+        self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.atan().numpy(), paddle.atan(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.tanh_shrink().numpy(),
+                           fluid.layers.tanh_shrink(x).numpy()))
+        self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy()))
+        m = x.abs()
+        self.assertTrue(
+            np.array_equal(m.sqrt().numpy(), paddle.sqrt(m).numpy()))
+        self.assertTrue(
+            np.array_equal(m.rsqrt().numpy(), paddle.rsqrt(m).numpy()))
+        self.assertTrue(
+            np.array_equal(x.ceil().numpy(), paddle.ceil(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.floor().numpy(), paddle.floor(x).numpy()))
+        self.assertTrue(np.array_equal(x.cos().numpy(), paddle.cos(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.acos().numpy(), paddle.acos(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.asin().numpy(), paddle.asin(x).numpy()))
+        self.assertTrue(np.array_equal(x.sin().numpy(), paddle.sin(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.sinh().numpy(), paddle.sinh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.cosh().numpy(), paddle.cosh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.round().numpy(), paddle.round(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.reciprocal().numpy(), paddle.reciprocal(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(x.square().numpy(), paddle.square(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.softplus().numpy(),
+                           fluid.layers.softplus(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.softsign().numpy(),
+                           fluid.layers.softsign(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()))
+        m = paddle.to_tensor(np.random.uniform(1, 2, [3, 3]), 'float32')
+        m = m.matmul(m.t())
+        self.assertTrue(
+            np.array_equal(m.cholesky().numpy(), paddle.cholesky(m).numpy()))
+
+        self.assertTrue(
+            np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.isfinite().numpy(), paddle.isfinite(x).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.cast('int32').numpy(), paddle.cast(x, 'int32').numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.expand([3, 2, 3]).numpy(),
+                paddle.expand(x, [3, 2, 3]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.tile([2, 2]).numpy(), paddle.tile(x, [2, 2]).numpy()))
+        self.assertTrue(
+            np.array_equal(x.flatten().numpy(), paddle.flatten(x).numpy()))
+        index = paddle.to_tensor([0, 1])
+        self.assertTrue(
+            np.array_equal(
+                x.gather(index).numpy(), paddle.gather(x, index).numpy()))
+        index = paddle.to_tensor([[0, 1], [1, 2]])
+        self.assertTrue(
+            np.array_equal(
+                x.gather_nd(index).numpy(), paddle.gather_nd(x, index).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.reverse([0, 1]).numpy(), paddle.reverse(x, [0, 1]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                a.reshape([3, 2]).numpy(), paddle.reshape(a, [3, 2]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.slice([0, 1], [0, 0], [1, 2]).numpy(),
+                paddle.slice(x, [0, 1], [0, 0], [1, 2]).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.split(2)[0].numpy(), paddle.split(x, 2)[0].numpy()))
+        m = paddle.to_tensor(
+            np.random.uniform(-1, 1, [1, 6, 1, 1]).astype(self.dtype))
+        self.assertTrue(
+            np.array_equal(
+                m.squeeze([]).numpy(), paddle.squeeze(m, []).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                m.squeeze([1, 2]).numpy(), paddle.squeeze(m, [1, 2]).numpy()))
+        m = paddle.to_tensor([2, 3, 3, 1, 5, 3], 'float32')
+        self.assertTrue(
+            np.array_equal(m.unique()[0].numpy(), paddle.unique(m)[0].numpy()))
+        self.assertTrue(
+            np.array_equal(m.unique_with_counts()[2],
+                           paddle.unique_with_counts(m)[2]))
+        self.assertTrue(np.array_equal(x.flip([0]), paddle.flip(x, [0])))
+        self.assertTrue(np.array_equal(x.unbind(0), paddle.unbind(x, 0)))
+        self.assertTrue(np.array_equal(x.roll(1), paddle.roll(x, 1)))
+        self.assertTrue(np.array_equal(x.cumsum(1), paddle.cumsum(x, 1)))
+        m = paddle.to_tensor(1)
+        self.assertTrue(np.array_equal(m.increment(), paddle.increment(m)))
+        m = x.abs()
+        self.assertTrue(np.array_equal(m.log(), paddle.log(m)))
+        self.assertTrue(np.array_equal(x.pow(2), paddle.pow(x, 2)))
+        self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x)))
+
+        # 2. Binary operation
+        self.assertTrue(
+            np.array_equal(
+                x.matmul(y, True, False).numpy(),
+                paddle.matmul(x, y, True, False).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.norm(
+                    p='fro', axis=[0, 1]).numpy(),
+                paddle.norm(
+                    x, p='fro', axis=[0, 1]).numpy()))
+        self.assertTrue(
+            np.array_equal(x.dist(y).numpy(), paddle.dist(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(x.cross(y).numpy(), paddle.cross(x, y).numpy()))
+        m = x.expand([2, 2, 3])
+        n = y.expand([2, 2, 3]).transpose([0, 2, 1])
+        self.assertTrue(
+            np.array_equal(m.bmm(n).numpy(), paddle.bmm(m, n).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.histogram(5, -1, 1).numpy(),
+                paddle.histogram(x, 5, -1, 1).numpy()))
+        self.assertTrue(
+            np.array_equal(x.equal(y).numpy(), paddle.equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.greater_equal(y).numpy(), paddle.greater_equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.greater_than(y).numpy(), paddle.greater_than(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.less_equal(y).numpy(), paddle.less_equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.less_than(y).numpy(), paddle.less_than(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.not_equal(y).numpy(), paddle.not_equal(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.equal_all(y).numpy(), paddle.equal_all(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.allclose(y).numpy(), paddle.allclose(x, y).numpy()))
+        m = x.expand([2, 2, 3])
+        self.assertTrue(
+            np.array_equal(
+                x.expand_as(m).numpy(), paddle.expand_as(x, m).numpy()))
+        index = paddle.to_tensor([2, 1, 0])
+        self.assertTrue(
+            np.array_equal(
+                a.scatter(index, b).numpy(),
+                paddle.scatter(a, index, b).numpy()))
+
+        # 3. Bool tensor operation
+        x = paddle.to_tensor([[True, False], [True, False]])
+        y = paddle.to_tensor([[False, False], [False, True]])
+        self.assertTrue(
+            np.array_equal(x.reduce_all().numpy(), paddle.reduce_all(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(x.reduce_any().numpy(), paddle.reduce_any(x).numpy(
+            )))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_not(y).numpy(), paddle.logical_not(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_or(y).numpy(), paddle.logical_or(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_xor(y).numpy(), paddle.logical_xor(x, y).numpy()))
+        self.assertTrue(
+            np.array_equal(
+                x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..884139a23d51c95c79439b91d501dc935baeae36
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -0,0 +1,336 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+
+
+def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
+    """Reference forward implementation using np.matmul."""
+    # np.matmul does not support the transpose flags, so we manually
+    # transpose X and Y appropriately.
+    if transpose_X:
+        if X.ndim == 1:
+            X = X.reshape((X.size, ))
+        elif X.ndim == 2:
+            X = X.T
+        else:
+            dim = [i for i in range(len(X.shape))]
+            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
+            X = np.transpose(X, tuple(dim))
+    if transpose_Y:
+        if Y.ndim == 1:
+            Y = Y.reshape((Y.size, ))
+        else:
+            dim = [i for i in range(len(Y.shape))]
+            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
+            Y = np.transpose(Y, tuple(dim))
+
+    Out = np.matmul(X, Y)
+    if not Out.shape:
+        # We do not support 0-dimensional Tensors (scalars). So where
+        # np.matmul outputs a scalar, we must convert to a Tensor of
+        # shape (1, ) instead.
+        # Everywhere else, we are compatible with np.matmul.
+        Out = np.array([Out], dtype="float64")
+    return Out
+
+
+class TestMatMulV2Op(OpTest):
+    """
+    case 1
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+    def setUp(self):
+        self.config()
+        self.op_type = "matmul_v2"
+        x = np.random.random(self.x_shape).astype(self.dtype)
+        y = np.random.random(self.y_shape).astype(self.dtype)
+        result = reference_matmul(x, y, self.trans_x, self.trans_y)
+
+        self.inputs = {
+            'X': x,
+            'Y': y,
+        }
+        self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y}
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+class TestMatMuklOp2(TestMatMulV2Op):
+    """
+    case 2
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 3, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+        self.dtype = "float64"
+
+
+class TestMatMuklOp3(TestMatMulV2Op):
+    """
+    case 3
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp4(TestMatMulV2Op):
+    """
+    case 4
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 2, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp5(TestMatMulV2Op):
+    """
+    case 5
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 100, 2)
+        self.y_shape = (100, )
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp6(TestMatMulV2Op):
+    """
+    case 6
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 100, 1)
+        self.y_shape = (100, )
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp7(TestMatMulV2Op):
+    """
+    case 7
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 1, 100)
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp8(TestMatMulV2Op):
+    """
+    case 8
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp9(TestMatMulV2Op):
+    """
+    case 9
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 1, 100)
+        self.y_shape = (2, 1, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+        self.dtype = "float64"
+
+
+class TestMatMuklOp10(TestMatMulV2Op):
+    """
+    case 10
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 2, 100)
+        self.y_shape = (1, 2, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp11(TestMatMulV2Op):
+    """
+    case 11
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp12(TestMatMulV2Op):
+    """
+    case 12
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 100, 2)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp13(TestMatMulV2Op):
+    """
+    case 13
+    """
+
+    def config(self):
+        self.x_shape = (2, 2, 100, 2)
+        self.y_shape = (2, 2, 100, 2)
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp14(TestMatMulV2Op):
+    """
+    case 14_1
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 1, 100, 2)
+        self.y_shape = (1, 2, 2, 100, 2)
+        self.trans_x = True
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp15(TestMatMulV2Op):
+    """
+    case 14_2
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 1, 2, 100)
+        self.y_shape = (1, 2, 2, 100, 1)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp16(TestMatMulV2Op):
+    """
+    case 16 : to check the gradient for special case
+    """
+
+    def config(self):
+        self.x_shape = (100)
+        self.y_shape = (1, 2, 2, 100, 1)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMuklOp17(TestMatMulV2Op):
+    """
+    case 17 : to check the gradient for special case
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 100)
+        self.y_shape = (100)
+        self.trans_x = False
+        self.trans_y = False
+        self.dtype = "float64"
+
+
+class TestMatMulV2API(unittest.TestCase):
+    def setUp(self):
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32")
+            input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32")
+
+            result = paddle.matmul(input_x, input_y)
+
+            x_np = np.random.random([4, 3]).astype("float32")
+            y_np = np.random.random([3, 4]).astype("float32")
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input_x": x_np,
+                                    "input_y": y_np},
+                              fetch_list=[result])
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_x = np.random.random([4, 3]).astype("float64")
+                input_y = np.random.random([3, 4]).astype("float64")
+                x = paddle.to_tensor(input_x)
+                y = paddle.to_tensor(input_y)
+                result = paddle.matmul(x, y)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9afc4bec66f2927a674ac15e807fe01f724c64f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_max_op.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMaxTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+    def test_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="float32")
+            result_max = paddle.max(x=data, axis=1)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.rand(10, 10).astype(np.float32)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
+        self.assertEqual((res == np.max(input_data, axis=1)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_max = paddle.max(x=data, axis=0)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
+        self.assertEqual((res == np.max(input_data, axis=0)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_max = paddle.max(x=data, axis=(0, 1))
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
+        self.assertEqual((res == np.max(input_data, axis=(0, 1))).all(), True)
+
+    def test_errors(self):
+        paddle.enable_static()
+
+        def test_input_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = np.random.rand(10, 10)
+                result_max = paddle.max(x=data, axis=0)
+
+        self.assertRaises(TypeError, test_input_type)
+
+        def test_axis_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+                axis = paddle.static.data("axis", shape=[10, 10], dtype="int64")
+                result_min = paddle.min(data, axis)
+
+        self.assertRaises(TypeError, test_axis_type)
+
+    def test_imperative_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(np_x)
+        z = paddle.max(x, axis=0)
+        np_z = z.numpy()
+        z_expected = np.array(np.max(np_x, axis=0))
+        self.assertEqual((np_z == z_expected).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5645597007a00cac9c75ec1ae90bc00a5bc75f22
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMaximumTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+        self.input_x = np.random.rand(10, 15).astype("float32")
+        self.input_y = np.random.rand(10, 15).astype("float32")
+        self.input_z = np.random.rand(15).astype("float32")
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
+            result_max = paddle.maximum(data_x, data_y)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "y": self.input_y},
+                           fetch_list=[result_max])
+        self.assertEqual((res == np.maximum(self.input_x, self.input_y)).all(),
+                         True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_z = paddle.static.data("z", shape=[15], dtype="float32")
+            result_max = paddle.maximum(data_x, data_z, axis=1)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "z": self.input_z},
+                           fetch_list=[result_max])
+        self.assertEqual((res == np.maximum(self.input_x, self.input_z)).all(),
+                         True)
+
+    def test_dynamic_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        z = paddle.maximum(x, y)
+        np_z = z.numpy()
+        z_expected = np.array(np.maximum(self.input_x, self.input_y))
+        self.assertEqual((np_z == z_expected).all(), True)
+
+    def test_broadcast_axis(self):
+        paddle.disable_static()
+        np_x = np.random.rand(5, 4, 3, 2).astype("float64")
+        np_y = np.random.rand(4, 3).astype("float64")
+
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        result_1 = paddle.maximum(x, y, axis=1)
+        result_2 = paddle.maximum(x, y, axis=-2)
+        self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index f3abd1acce6fafb8d187bfbe82765f982acae010..29e79b096cf790858e8e07aedc5c6b76881e8f82 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -17,10 +17,13 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
+np.random.seed(10)
+
 
 class TestMeanOp(OpTest):
     def setUp(self):
@@ -73,5 +76,182 @@ class TestFP16MeanOp(TestMeanOp):
                 place, ['X'], 'Out', max_relative_error=0.8)
 
 
+def ref_reduce_mean(x, axis=None, keepdim=False, reduce_all=False):
+    if isinstance(axis, list):
+        axis = tuple(axis)
+    if reduce_all:
+        axis = None
+    return np.mean(x, axis=axis, keepdims=keepdim)
+
+
+class TestReduceMeanOp(OpTest):
+    def setUp(self):
+        self.op_type = 'reduce_mean'
+        self.dtype = 'float64'
+        self.shape = [2, 3, 4, 5]
+        self.axis = [0]
+        self.keepdim = False
+        self.reduce_all = False
+        self.set_attrs()
+
+        np.random.seed(10)
+        x_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out_np = ref_reduce_mean(x_np, self.axis, self.keepdim, self.reduce_all)
+        self.inputs = {'X': x_np}
+        self.outputs = {'Out': out_np}
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keepdim,
+            'reduce_all': self.reduce_all
+        }
+
+    def set_attrs(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['Out'])
+
+
+class TestReduceMeanOpDefaultAttrs(TestReduceMeanOp):
+    def setUp(self):
+        self.op_type = 'reduce_mean'
+        self.dtype = 'float64'
+        self.shape = [2, 3, 4, 5]
+
+        x_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out_np = np.mean(x_np, axis=0)
+        self.inputs = {'X': x_np}
+        self.outputs = {'Out': out_np}
+
+
+class TestReduceMeanOpFloat32(TestReduceMeanOp):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestReduceMeanOpShape1D(TestReduceMeanOp):
+    def set_attrs(self):
+        self.shape = [100]
+
+
+class TestReduceMeanOpShape6D(TestReduceMeanOp):
+    def set_attrs(self):
+        self.shape = [2, 3, 4, 5, 6, 7]
+
+
+class TestReduceMeanOpAxisAll(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = [0, 1, 2, 3]
+
+
+class TestReduceMeanOpAxisTuple(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = (0, 1, 2)
+
+
+class TestReduceMeanOpAxisNegative(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = [-2, -1]
+
+
+class TestReduceMeanOpKeepdimTrue1(TestReduceMeanOp):
+    def set_attrs(self):
+        self.keepdim = True
+
+
+class TestReduceMeanOpKeepdimTrue2(TestReduceMeanOp):
+    def set_attrs(self):
+        self.axis = [0, 1, 2, 3]
+        self.keepdim = True
+
+
+class TestReduceMeanOpReduceAllTrue(TestReduceMeanOp):
+    def set_attrs(self):
+        self.reduce_all = True
+
+
+class TestMeanAPI(unittest.TestCase):
+    # test paddle.tensor.stat.mean
+
+    def setUp(self):
+        self.x_shape = [2, 3, 4, 5]
+        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_api_static(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_shape)
+            out1 = paddle.mean(x)
+            out2 = paddle.tensor.mean(x)
+            out3 = paddle.tensor.stat.mean(x)
+            axis = np.arange(len(self.x_shape)).tolist()
+            out4 = paddle.mean(x, axis)
+            out5 = paddle.mean(x, tuple(axis))
+
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x},
+                          fetch_list=[out1, out2, out3, out4, out5])
+        out_ref = np.mean(self.x)
+        for out in res:
+            self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True)
+
+    def test_api_dygraph(self):
+        paddle.disable_static(self.place)
+
+        def test_case(x, axis=None, keepdim=False):
+            x_tensor = paddle.to_variable(x)
+            out = paddle.mean(x_tensor, axis, keepdim)
+            if isinstance(axis, list):
+                axis = tuple(axis)
+                if len(axis) == 0:
+                    axis = None
+            out_ref = np.mean(x, axis, keepdims=keepdim)
+            self.assertEqual(
+                np.allclose(
+                    out.numpy(), out_ref, rtol=1e-04), True)
+
+        test_case(self.x)
+        test_case(self.x, [])
+        test_case(self.x, -1)
+        test_case(self.x, keepdim=True)
+        test_case(self.x, 2, keepdim=True)
+        test_case(self.x, [0, 2])
+        test_case(self.x, (0, 2))
+        test_case(self.x, [0, 1, 2, 3])
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data("x", shape=[10, 10], dtype="float32")
+            out = fluid.layers.reduce_mean(input=x, dim=1)
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            x_np = np.random.rand(10, 10).astype(np.float32)
+            res = exe.run(feed={"x": x_np}, fetch_list=[out])
+        self.assertEqual(np.allclose(res[0], np.mean(x_np, axis=1)), True)
+
+        with fluid.dygraph.guard():
+            x_np = np.random.rand(10, 10).astype(np.float32)
+            x = fluid.dygraph.to_variable(x_np)
+            out = fluid.layers.reduce_mean(input=x, dim=1)
+        self.assertEqual(np.allclose(out.numpy(), np.mean(x_np, axis=1)), True)
+
+    def test_errors(self):
+        paddle.disable_static()
+        x = np.random.uniform(-1, 1, [10, 12]).astype('float32')
+        x = paddle.to_tensor(x)
+        self.assertRaises(Exception, paddle.mean, x, -3)
+        self.assertRaises(Exception, paddle.mean, x, 2)
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [10, 12], 'int32')
+            self.assertRaises(TypeError, paddle.mean, x)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_metrics.py b/python/paddle/fluid/tests/unittests/test_metrics.py
deleted file mode 100644
index ec27884cae2b0462951f6597b1b83e58d1c8af5d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_metrics.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.fluid as fluid
-from paddle.fluid.framework import Program, program_guard
-
-
-class TestMetricsDetectionMap(unittest.TestCase):
-    def test_detection_map(self):
-        program = fluid.Program()
-        with program_guard(program):
-            detect_res = fluid.layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = fluid.layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='float32')
-            box = fluid.layers.data(
-                name='bbox',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            map_eval = fluid.metrics.DetectionMAP(
-                detect_res, label, box, class_num=21)
-            cur_map, accm_map = map_eval.get_map_var()
-            self.assertIsNotNone(cur_map)
-            self.assertIsNotNone(accm_map)
-        print(str(program))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9eff05c5ea9fb585421b6f99bf55b3bb95bf9ff
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_min_op.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMinTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+    def test_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="float32")
+            result_min = paddle.min(x=data, axis=1)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.rand(10, 10).astype(np.float32)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
+        self.assertEqual((res == np.min(input_data, axis=1)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_min = paddle.min(x=data, axis=0)
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
+        self.assertEqual((res == np.min(input_data, axis=0)).all(), True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+            result_min = paddle.min(x=data, axis=(0, 1))
+            exe = paddle.static.Executor(self.place)
+            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
+            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
+        self.assertEqual((res == np.min(input_data, axis=(0, 1))).all(), True)
+
+    def test_errors(self):
+        paddle.enable_static()
+
+        def test_input_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = np.random.rand(10, 10)
+                result_min = paddle.min(x=data, axis=0)
+
+        self.assertRaises(TypeError, test_input_type)
+
+        def test_axis_type():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                data = paddle.static.data("data", shape=[10, 10], dtype="int64")
+                axis = paddle.static.data("axis", shape=[10, 10], dtype="int64")
+                result_min = paddle.min(data, axis)
+
+        self.assertRaises(TypeError, test_axis_type)
+
+    def test_imperative_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(np_x)
+        z = paddle.min(x, axis=0)
+        np_z = z.numpy()
+        z_expected = np.array(np.min(np_x, axis=0))
+        self.assertEqual((np_z == z_expected).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c08b7386ca2c5da04c0a289872dacf68a2ea040
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+
+
+class ApiMinimumTest(unittest.TestCase):
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+        self.input_x = np.random.rand(10, 15).astype("float32")
+        self.input_y = np.random.rand(10, 15).astype("float32")
+        self.input_z = np.random.rand(15).astype("float32")
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
+            result_min = paddle.minimum(data_x, data_y)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "y": self.input_y},
+                           fetch_list=[result_min])
+        self.assertEqual((res == np.minimum(self.input_x, self.input_y)).all(),
+                         True)
+
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            data_x = paddle.static.data("x", shape=[10, 15], dtype="float32")
+            data_z = paddle.static.data("z", shape=[15], dtype="float32")
+            result_min = paddle.minimum(data_x, data_z, axis=1)
+            exe = paddle.static.Executor(self.place)
+            res, = exe.run(feed={"x": self.input_x,
+                                 "z": self.input_z},
+                           fetch_list=[result_min])
+        self.assertEqual((res == np.minimum(self.input_x, self.input_z)).all(),
+                         True)
+
+    def test_dynamic_api(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        z = paddle.minimum(x, y)
+        np_z = z.numpy()
+        z_expected = np.array(np.minimum(self.input_x, self.input_y))
+        self.assertEqual((np_z == z_expected).all(), True)
+
+    def test_broadcast_axis(self):
+        paddle.disable_static()
+        np_x = np.random.rand(5, 4, 3, 2).astype("float64")
+        np_y = np.random.rand(4, 3).astype("float64")
+
+        x = paddle.to_variable(self.input_x)
+        y = paddle.to_variable(self.input_y)
+        result_1 = paddle.minimum(x, y, axis=1)
+        result_2 = paddle.minimum(x, y, axis=-2)
+        self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py
index 39601eb0e12ff8d4debdda414c55fb43a9b41d79..f6207edb41c190ac51dfe67dad22bb0191a67a07 100644
--- a/python/paddle/fluid/tests/unittests/test_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_monitor.py
@@ -16,6 +16,7 @@ TestCases for Monitor
 """
 
 from __future__ import print_function
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist([
diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py
index 89052396cf94615aab0841090430509c38b8423f..753d96c44114a552f4bdd299602d7f13f672efbf 100644
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py
@@ -69,6 +69,7 @@ class TestNNMseLoss(unittest.TestCase):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
             label_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -106,6 +107,7 @@ class TestNNMseLoss(unittest.TestCase):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
             label_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -143,6 +145,7 @@ class TestNNMseLoss(unittest.TestCase):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
             label_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -177,5 +180,112 @@ class TestNNMseLoss(unittest.TestCase):
             self.assertTrue(dy_result.shape, [1])
 
 
+class TestNNFunctionalMseLoss(unittest.TestCase):
+    def test_NNFunctionalMseLoss_mean(self):
+        for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
+            input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            target_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                input = paddle.data(name='input', shape=dim, dtype='float32')
+                target = paddle.data(name='target', shape=dim, dtype='float32')
+                mse_loss = paddle.nn.functional.mse_loss(input, target, 'mean')
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+            static_result = exe.run(
+                prog,
+                feed={"input": input_np,
+                      "target": target_np},
+                fetch_list=[mse_loss])
+
+            paddle.disable_static()
+            dy_ret = paddle.nn.functional.mse_loss(
+                paddle.to_variable(input_np),
+                paddle.to_variable(target_np), 'mean')
+            dy_result = dy_ret.numpy()
+
+            sub = input_np - target_np
+            expected = np.mean(sub * sub)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            self.assertTrue(dy_result.shape, [1])
+
+    def test_NNFunctionalMseLoss_sum(self):
+        for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
+            input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            target_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                input = paddle.data(name='input', shape=dim, dtype='float32')
+                target = paddle.data(name='target', shape=dim, dtype='float32')
+                mse_loss = paddle.nn.functional.mse_loss(input, target, 'sum')
+
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+                static_result = exe.run(
+                    prog,
+                    feed={"input": input_np,
+                          "target": target_np},
+                    fetch_list=[mse_loss])
+
+            paddle.disable_static()
+            dy_ret = paddle.nn.functional.mse_loss(
+                paddle.to_variable(input_np),
+                paddle.to_variable(target_np), 'sum')
+            dy_result = dy_ret.numpy()
+
+            sub = input_np - target_np
+            expected = np.sum(sub * sub)
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            self.assertTrue(dy_result.shape, [1])
+
+    def test_NNFunctionalMseLoss_none(self):
+        for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
+            input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            target_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
+            paddle.enable_static()
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                input = paddle.data(name='input', shape=dim, dtype='float32')
+                target = paddle.data(name='target', shape=dim, dtype='float32')
+                mse_loss = paddle.nn.functional.mse_loss(input, target, 'none')
+
+                exe = paddle.static.Executor(place)
+                exe.run(startup_prog)
+                static_result = exe.run(
+                    prog,
+                    feed={"input": input_np,
+                          "target": target_np},
+                    fetch_list=[mse_loss])
+
+            paddle.disable_static()
+            dy_ret = paddle.nn.functional.mse_loss(
+                paddle.to_variable(input_np),
+                paddle.to_variable(target_np), 'none')
+            dy_result = dy_ret.numpy()
+
+            sub = input_np - target_np
+            expected = sub * sub
+            self.assertTrue(np.allclose(static_result, expected))
+            self.assertTrue(np.allclose(static_result, dy_result))
+            self.assertTrue(np.allclose(dy_result, expected))
+            self.assertTrue(dy_result.shape, [1])
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 8ca06aa952184daec6be59a09330c8f16f6ee1d6..5f223de1954f7b401ac031265cca8c2e661c7392 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -175,5 +175,57 @@ class TestFP16MulOp2(TestMulOp2):
                 no_grad_set=set('Y'))
 
 
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUMulOp1(TestMulOp):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place, atol=1e-1)
+
+    def test_check_grad_normal(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ingore_x(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUMulOp2(TestMulOp2):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place, atol=2e-1)
+
+    def test_check_grad_normal(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.9)
+
+    def test_check_grad_ingore_x(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py
old mode 100644
new mode 100755
index 64421f6a1c6a018fdf82a7518f647099830972b3..dbf167617a24f36a36aff52d996a50ca3ebb6672
--- a/python/paddle/fluid/tests/unittests/test_multiply.py
+++ b/python/paddle/fluid/tests/unittests/test_multiply.py
@@ -26,8 +26,10 @@ class TestMultiplyAPI(unittest.TestCase):
 
     def __run_static_graph_case(self, x_data, y_data, axis=-1):
         with program_guard(Program(), Program()):
-            x = paddle.nn.data(name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = paddle.nn.data(name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = paddle.static.data(
+                name='x', shape=x_data.shape, dtype=x_data.dtype)
+            y = paddle.static.data(
+                name='y', shape=y_data.shape, dtype=y_data.dtype)
             res = tensor.multiply(x, y, axis=axis)
 
             place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
@@ -41,9 +43,9 @@ class TestMultiplyAPI(unittest.TestCase):
             return res
 
     def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
-        paddle.enable_imperative()
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
+        paddle.disable_static()
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
         res = paddle.multiply(x, y, axis=axis)
         return res.numpy()
 
@@ -107,34 +109,48 @@ class TestMultiplyError(unittest.TestCase):
     def test_errors(self):
         """test_errors."""
         # test static computation graph: dtype can not be int8
-        paddle.disable_imperative()
+        paddle.enable_static()
         with program_guard(Program(), Program()):
-            x = paddle.nn.data(name='x', shape=[100], dtype=np.int8)
-            y = paddle.nn.data(name='y', shape=[100], dtype=np.int8)
+            x = paddle.static.data(name='x', shape=[100], dtype=np.int8)
+            y = paddle.static.data(name='y', shape=[100], dtype=np.int8)
             self.assertRaises(TypeError, tensor.multiply, x, y)
 
         # test static computation graph: inputs must be broadcastable 
         with program_guard(Program(), Program()):
-            x = paddle.nn.data(name='x', shape=[20, 50], dtype=np.float64)
-            y = paddle.nn.data(name='y', shape=[20], dtype=np.float64)
+            x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64)
+            y = paddle.static.data(name='y', shape=[20], dtype=np.float64)
             self.assertRaises(fluid.core.EnforceNotMet, tensor.multiply, x, y)
 
         np.random.seed(7)
         # test dynamic computation graph: dtype can not be int8
-        paddle.enable_imperative()
+        paddle.disable_static()
         x_data = np.random.randn(200).astype(np.int8)
         y_data = np.random.randn(200).astype(np.int8)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
         self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
 
         # test dynamic computation graph: inputs must be broadcastable
         x_data = np.random.rand(200, 5)
         y_data = np.random.rand(200)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
         self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
 
+        # test dynamic computation graph: inputs must be broadcastable(python)
+        x_data = np.random.rand(200, 5)
+        y_data = np.random.rand(200)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
+
+        # test dynamic computation graph: dtype must be same
+        x_data = np.random.randn(200).astype(np.int64)
+        y_data = np.random.randn(200).astype(np.float64)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        self.assertRaises(TypeError, paddle.multiply, x, y)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
index f3b15835b9e6f2797a2c76758d0b42db3d50ff27..3a8867f6bd29f5bc0e512f9c8b22ecf192253fc7 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
@@ -24,7 +24,7 @@ import numpy as np
 
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.io import Dataset, BatchSampler, DataLoader
+from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.dygraph.base import to_variable
 
@@ -108,6 +108,48 @@ class TestDataLoaderAssert(unittest.TestCase):
                 self.assertTrue(False)
 
 
+class TestDatasetRuntimeError(unittest.TestCase):
+    def test_main(self):
+        dataset = Dataset()
+
+        # __getitem__ not implement
+        try:
+            d = dataset[0]
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+        # __len__ not implement
+        try:
+            l = len(dataset)
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+        dataset = IterableDataset()
+
+        # __iter__ not implement
+        try:
+            d = iter(dataset)
+            self.assertTrue(False)
+        except NotImplementedError:
+            pass
+
+        # __getitem__ runtime error
+        try:
+            d = dataset[0]
+            self.assertTrue(False)
+        except RuntimeError:
+            pass
+
+        # __len__ runtime error
+        try:
+            l = len(dataset)
+            self.assertTrue(False)
+        except RuntimeError:
+            pass
+
+
 # CI Converage cannot record stub in subprocess,
 # HACK a _worker_loop in main process call here
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -144,12 +186,15 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                     indices_queue.put([i, i + 10])
                 indices_queue.put(None)
                 loader._worker_loop(
-                    loader._dataset, indices_queue, loader._data_queue,
-                    loader._workers_done_event, _collate_fn, _init_fn, 0)
+                    loader._dataset, 0, indices_queue, loader._data_queue,
+                    loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
                 self.assertTrue(False)
         except AssertionError:
             pass
-        except Exception:
+        except Exception as e:
+            print("Exception", e)
+            import sys
+            sys.stdout.flush()
             self.assertTrue(False)
 
     def run_with_worker_done(self, use_shared_memory=True):
@@ -184,8 +229,8 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                 indices_queue.put(None)
                 loader._workers_done_event.set()
                 loader._worker_loop(
-                    loader._dataset, indices_queue, loader._data_queue,
-                    loader._workers_done_event, _collate_fn, _init_fn, 0)
+                    loader._dataset, 0, indices_queue, loader._data_queue,
+                    loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
                 self.assertTrue(True)
         except AssertionError:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f0209406fdff1d4f7659b15d5e6bd8af74fd0f3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import os
+import sys
+import six
+import time
+import unittest
+import multiprocessing
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.io import Dataset, BatchSampler, DataLoader
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.dygraph.base import to_variable
+
+from test_multiprocess_dataloader_iterable_dataset_static import RandomDataset, prepare_places
+from test_multiprocess_dataloader_iterable_dataset_static import EPOCH_NUM, BATCH_SIZE, IMAGE_SIZE, SAMPLE_NUM, CLASS_NUM
+
+
+class SimpleFCNet(fluid.dygraph.Layer):
+    def __init__(self):
+        super(SimpleFCNet, self).__init__()
+
+        param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.8))
+        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5))
+        self._fcs = []
+        in_channel = IMAGE_SIZE
+        for hidden_size in [10, 20, 30]:
+            self._fcs.append(
+                Linear(
+                    in_channel,
+                    hidden_size,
+                    act='tanh',
+                    param_attr=param_attr,
+                    bias_attr=bias_attr))
+            in_channel = hidden_size
+        self._fcs.append(
+            Linear(
+                in_channel,
+                CLASS_NUM,
+                act='softmax',
+                param_attr=param_attr,
+                bias_attr=bias_attr))
+
+    def forward(self, image):
+        out = image
+        for fc in self._fcs:
+            out = fc(out)
+        return out
+
+
+class TestDygraphDataLoader(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        fluid.default_startup_program().random_seed = 1
+        fluid.default_main_program().random_seed = 1
+        with fluid.dygraph.guard(places[0]):
+            fc_net = SimpleFCNet()
+            optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
+
+            dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
+            dataloader = DataLoader(
+                dataset,
+                places=places,
+                num_workers=num_workers,
+                batch_size=BATCH_SIZE,
+                drop_last=True)
+
+            step_list = []
+            loss_list = []
+            start_t = time.time()
+            for _ in six.moves.range(EPOCH_NUM):
+                step = 0
+                for image, label in dataloader():
+                    out = fc_net(image)
+                    loss = fluid.layers.cross_entropy(out, label)
+                    avg_loss = fluid.layers.reduce_mean(loss)
+                    avg_loss.backward()
+                    optimizer.minimize(avg_loss)
+                    fc_net.clear_gradients()
+
+                    loss_list.append(np.mean(avg_loss.numpy()))
+                    step += 1
+                step_list.append(step)
+
+        end_t = time.time()
+        ret = {
+            "time": end_t - start_t,
+            "step": step_list,
+            "loss": np.array(loss_list)
+        }
+        print("time cost", ret['time'], 'step_list', ret['step'])
+        return ret
+
+    def test_main(self):
+        # dynamic graph do not run with_data_parallel
+        for p in prepare_places(False):
+            results = []
+            for num_workers in [0, 2]:
+                print(self.__class__.__name__, p, num_workers)
+                sys.stdout.flush()
+                ret = self.run_main(num_workers=num_workers, places=p)
+                results.append(ret)
+            assert results[0]['loss'].shape[0] * 2 == results[1]['loss'].shape[
+                0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
new file mode 100644
index 0000000000000000000000000000000000000000..562051335850a5b665580981d2c41a20c8fe7575
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import math
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.io import IterableDataset, BatchSampler, DataLoader, get_worker_info
+
+
+class RangeIterableDatasetSplit(IterableDataset):
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __iter__(self):
+        worker_info = get_worker_info()
+        if worker_info is None:
+            iter_start = self.start
+            iter_end = self.end
+        else:
+            per_worker = int(
+                math.ceil((self.end - self.start) / float(
+                    worker_info.num_workers)))
+            worker_id = worker_info.id
+            iter_start = self.start + worker_id * per_worker
+            iter_end = min(iter_start + per_worker, self.end)
+
+        for i in range(iter_start, iter_end):
+            yield np.array([i])
+
+
+class TestDynamicDataLoaderIterSplit(unittest.TestCase):
+    def test_main(self):
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            dataset = RangeIterableDatasetSplit(0, 10)
+            dataloader = DataLoader(
+                dataset,
+                places=place,
+                num_workers=2,
+                batch_size=1,
+                drop_last=True)
+
+            rets = []
+            for d in dataloader:
+                rets.append(d[0].numpy()[0][0])
+
+            assert tuple(sorted(rets)) == tuple(range(0, 10))
+
+
+class RangeIterableDataset(IterableDataset):
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __iter__(self):
+        for i in range(self.start, self.end):
+            yield np.array([i])
+
+
+class TestDynamicDataLoaderIterInitFuncSplit(unittest.TestCase):
+    def test_main(self):
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            dataset = RangeIterableDataset(0, 10)
+
+            def worker_spliter(worker_id):
+                worker_info = get_worker_info()
+
+                dataset = worker_info.dataset
+                start = dataset.start
+                end = dataset.end
+                num_per_worker = int(
+                    math.ceil((end - start) / float(worker_info.num_workers)))
+
+                worker_id = worker_info.id
+                dataset.start = start + worker_id * num_per_worker
+                dataset.end = min(dataset.start + num_per_worker, end)
+
+            dataloader = DataLoader(
+                dataset,
+                places=place,
+                num_workers=1,
+                batch_size=1,
+                drop_last=True,
+                worker_init_fn=worker_spliter)
+
+            rets = []
+            for d in dataloader:
+                rets.append(d[0].numpy()[0][0])
+
+            assert tuple(sorted(rets)) == tuple(range(0, 10))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..e64e11d156ec74a375c161926ce3671e83f2352a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import os
+import sys
+import six
+import time
+import unittest
+import multiprocessing
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.io import IterableDataset, BatchSampler, DataLoader, get_worker_info
+
+EPOCH_NUM = 2
+BATCH_SIZE = 8
+IMAGE_SIZE = 32
+SAMPLE_NUM = 80
+CLASS_NUM = 10
+
+
+class RandomDataset(IterableDataset):
+    def __init__(self, sample_num, class_num):
+        self.sample_num = sample_num
+        self.class_num = class_num
+
+    def __iter__(self):
+        for i in range(self.sample_num):
+            np.random.seed(i)
+            image = np.random.random([IMAGE_SIZE]).astype('float32')
+            label = np.random.randint(0, self.class_num - 1,
+                                      (1, )).astype('int64')
+            yield image, label
+
+
+def simple_fc_net_static():
+    startup_prog = fluid.Program()
+    main_prog = fluid.Program()
+    startup_prog.random_seed = 1
+    main_prog.random_seed = 1
+
+    with fluid.unique_name.guard():
+        with fluid.program_guard(main_prog, startup_prog):
+            image = fluid.data(
+                name='image', shape=[None, IMAGE_SIZE], dtype='float32')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+            hidden = image
+            param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.8))
+            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.5))
+            for hidden_size in [10, 20, 30]:
+                hidden = fluid.layers.fc(hidden,
+                                         size=hidden_size,
+                                         act='tanh',
+                                         param_attr=param_attr,
+                                         bias_attr=bias_attr)
+
+            predict_label = fluid.layers.fc(hidden,
+                                            size=CLASS_NUM,
+                                            act='softmax',
+                                            param_attr=param_attr,
+                                            bias_attr=bias_attr)
+            loss = fluid.layers.reduce_mean(
+                fluid.layers.cross_entropy(
+                    input=predict_label, label=label))
+
+            optimizer = fluid.optimizer.Adam()
+            optimizer.minimize(loss)
+    return startup_prog, main_prog, image, label, loss
+
+
+def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True):
+    places = []
+    if with_cpu:
+        places.append([fluid.CPUPlace()])
+        if with_data_parallel:
+            places.append([fluid.CPUPlace()] * 2)
+
+    if with_gpu and fluid.core.is_compiled_with_cuda():
+        tmp = fluid.cuda_places()[:2]
+        assert len(tmp) > 0, "no gpu detected"
+        if with_data_parallel:
+            places.append(tmp)
+        places.append([tmp[0]])
+    return places
+
+
+class TestStaticDataLoader(unittest.TestCase):
+    def run_main(self, num_workers, places):
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            startup_prog, main_prog, image, label, loss = simple_fc_net_static()
+
+            dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
+            dataloader = DataLoader(
+                dataset,
+                feed_list=[image, label],
+                places=places,
+                num_workers=num_workers,
+                batch_size=BATCH_SIZE,
+                drop_last=True)
+            # assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
+
+            exe = fluid.Executor(place=places[0])
+            exe.run(startup_prog)
+
+            prog = fluid.CompiledProgram(main_prog)
+            if len(places) > 1:
+                prog = prog.with_data_parallel(
+                    loss_name=loss.name, places=places)
+
+            step_list = []
+            loss_list = []
+            start_t = time.time()
+            for i in six.moves.range(EPOCH_NUM):
+                step = 0
+                for d in dataloader:
+                    assert len(d) == len(places), "{} != {}".format(
+                        len(d), len(places))
+                    for i, item in enumerate(d):
+                        image = item['image']
+                        label = item['label']
+                        assert image.shape() == [BATCH_SIZE, IMAGE_SIZE]
+                        assert label.shape() == [BATCH_SIZE, 1]
+                        assert image._place()._equals(places[i])
+                        assert label._place()._equals(places[i])
+                    L, = exe.run(program=prog,
+                                 feed=d,
+                                 fetch_list=[loss],
+                                 use_program_cache=True)
+                    loss_list.append(np.mean(L))
+                    step += 1
+                step_list.append(step)
+
+        end_t = time.time()
+        ret = {
+            "time": end_t - start_t,
+            "step": step_list,
+            "loss": np.array(loss_list)
+        }
+        print("time cost", ret['time'], 'step_list', ret['step'])
+        return ret
+
+    def test_main(self):
+        for p in prepare_places(True):
+            results = []
+            for num_workers in [0, 2]:
+                print(self.__class__.__name__, p, num_workers)
+                sys.stdout.flush()
+                ret = self.run_main(num_workers=num_workers, places=p)
+                results.append(ret)
+            assert results[0]['loss'].shape[0] * 2 == results[1]['loss'].shape[
+                0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
index e5f44403a91f5167996359a233aee37bf622db9d..38497f91fc18847e40efa691a65c2a7adc20e51c 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
@@ -137,14 +137,8 @@ class TestStaticDataLoader(unittest.TestCase):
                         label = item['label']
                         assert image.shape() == [BATCH_SIZE, IMAGE_SIZE]
                         assert label.shape() == [BATCH_SIZE, 1]
-                        if places[i]._equals(fluid.CPUPlace()):
-                            assert image._place()._equals(fluid.CPUPlace())
-                            assert label._place()._equals(fluid.CPUPlace())
-                        else:
-                            assert image._place()._equals(fluid.CUDAPinnedPlace(
-                            ))
-                            assert label._place()._equals(fluid.CUDAPinnedPlace(
-                            ))
+                        assert image._place()._equals(places[i])
+                        assert label._place()._equals(places[i])
                     L, = exe.run(program=prog,
                                  feed=d,
                                  fetch_list=[loss],
diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py
index b14e3a15d979c6f66c2ffeeeec6536d5a8ab3b47..e7154193beaf788a9d20f3c131b1df3420918266 100644
--- a/python/paddle/fluid/tests/unittests/test_nll_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py
@@ -445,7 +445,6 @@ class TestNLLLoss(unittest.TestCase):
         startup_prog = fluid.Program()
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(
                 name='input', shape=[5, 3, 5, 5], dtype='float64')
@@ -879,5 +878,93 @@ class TestNLLLossOp2DNoReduce(OpTest):
         self.label_shape = [5, 5, 5]
 
 
+class TestNLLLossName(unittest.TestCase):
+    def test_name(self):
+        prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        place = paddle.CPUPlace()
+        with paddle.static.program_guard(prog, startup_prog):
+            x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+            label = paddle.data(name='label', shape=[10], dtype='int64')
+            nll_loss = paddle.nn.loss.NLLLoss(name='nll_loss')
+            res = nll_loss(x, label)
+            self.assertTrue(res.name.startswith('nll_loss'))
+
+
+class TestNLLLossInvalidArgs(unittest.TestCase):
+    def test_x_dim_value_error(self):
+        def test_x_dim_lt_2():
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                x = paddle.data(name='x', shape=[10, ], dtype='float64')
+                label = paddle.data(name='label', shape=[10, ], dtype='float64')
+                nll_loss = paddle.nn.loss.NLLLoss()
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError, test_x_dim_lt_2)
+
+        def test_x_dim_imperative_lt_2():
+            with fluid.dygraph.guard():
+                x_np = np.random.random(size=(5, )).astype(np.float64)
+                label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64)
+                x = paddle.to_variable(x_np)
+                label = paddle.to_variable(label_np)
+                nll_loss = paddle.nn.loss.NLLLoss()
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError, test_x_dim_imperative_lt_2)
+
+    def test_reduction_value_error(self):
+        def test_NLLLoss_reduction_not_sum_mean_none():
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                label = paddle.data(name='label', shape=[10], dtype='int64')
+                nll_loss = paddle.nn.loss.NLLLoss(reduction='')
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError, test_NLLLoss_reduction_not_sum_mean_none)
+
+        def test_NLLLoss_reduction_imperative_not_sum_mean_none():
+            with fluid.dygraph.guard():
+                x_np = np.random.random(size=(5, 3)).astype(np.float64)
+                label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
+                x = paddle.to_variable(x_np)
+                label = paddle.to_variable(label_np)
+                nll_loss = paddle.nn.loss.NLLLoss(reduction='')
+                res = nll_loss(x, label)
+
+        self.assertRaises(ValueError,
+                          test_NLLLoss_reduction_imperative_not_sum_mean_none)
+
+        def test_nll_loss_function_reduction_not_sum_mean_none():
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            place = paddle.CPUPlace()
+            with paddle.static.program_guard(prog, startup_prog):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+                label = paddle.data(name='label', shape=[10], dtype='int64')
+                res = paddle.nn.functional.nll_loss(x, label, reduction='')
+
+        self.assertRaises(ValueError,
+                          test_nll_loss_function_reduction_not_sum_mean_none)
+
+        def test_nll_loss_function_reduction_imperative_not_sum_mean_none():
+            with fluid.dygraph.guard():
+                x_np = np.random.random(size=(5, 3)).astype(np.float64)
+                label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
+                x = paddle.to_variable(x_np)
+                label = paddle.to_variable(label_np)
+                res = paddle.nn.functional.nll_loss(x, label, reduction='')
+
+        self.assertRaises(
+            ValueError,
+            test_nll_loss_function_reduction_imperative_not_sum_mean_none)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..339f689998f817054611bd85b11945b61d1f649b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
@@ -0,0 +1,207 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.nn.functional as functional
+import paddle.fluid.framework as framework
+from paddle.fluid.framework import Program, program_guard
+
+
+class TestOneHotOp(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        depth_np = np.array(10).astype('int32')
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_attr(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]), 1,
+                              depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, 0, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_default_dtype(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        depth_np = np.array(10).astype('int32')
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
+        self.attrs = {}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_default_dtype_attr(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        depth = 10
+        dimension = 12
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]), 1,
+                              depth)).astype('float32')
+
+        for i in range(np.product(x.shape)):
+            out[i, 0, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'depth': depth}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+
+class TestOneHotOp_exception(unittest.TestCase):
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        self.depth = 10
+        self.place = core.CPUPlace()
+        self.dimension = 12
+        self.x = core.LoDTensor()
+        x_lod = [[4, 1, 3, 3]]
+        data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))]
+        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
+        self.x.set(data, self.place)
+        self.x.set_recursive_sequence_lengths(x_lod)
+
+    def test_check_output(self):
+        program = Program()
+        with program_guard(program):
+            x = fluid.layers.data(
+                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            block = program.current_block()
+            one_hot_out = block.create_var(
+                name="one_hot_out",
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                dtype='float32')
+            block.append_op(
+                type='one_hot',
+                inputs={'X': x},
+                attrs={'depth': self.depth},
+                outputs={'Out': one_hot_out})
+            exe = fluid.Executor(self.place)
+
+            def run():
+                exe.run(feed={'x': self.x},
+                        fetch_list=[one_hot_out],
+                        return_numpy=False)
+
+            self.assertRaises(core.EnforceNotMet, run)
+
+
+class TestOneHotOpApi(unittest.TestCase):
+    def test_api(self):
+        num_classes = 10
+        self._run(num_classes)
+
+    def test_api_with_depthTensor(self):
+        num_classes = fluid.layers.assign(input=np.array([10], dtype=np.int32))
+        self._run(num_classes)
+
+    def test_api_with_dygraph(self):
+        num_classes = 10
+        label = np.array(
+            [np.random.randint(0, num_classes - 1)
+             for i in range(6)]).reshape([6, 1])
+        with fluid.dygraph.guard():
+            one_hot_label = functional.one_hot(
+                x=fluid.dygraph.to_variable(label), num_classes=num_classes)
+
+    def _run(self, num_classes):
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        one_hot_label = functional.one_hot(x=label, num_classes=num_classes)
+
+        place = fluid.CPUPlace()
+        label_data = np.array([np.random.randint(0, 10 - 1)
+                               for i in range(6)]).reshape([6, 1])
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        ret = exe.run(feed={'label': label_data, },
+                      fetch_list=[one_hot_label],
+                      return_numpy=False)
+
+
+class BadInputTestOnehotV2(unittest.TestCase):
+    def test_error(self):
+        with fluid.program_guard(fluid.Program()):
+
+            def test_bad_x():
+                label = fluid.layers.data(
+                    name="label",
+                    shape=[4],
+                    append_batch_size=False,
+                    dtype="float32")
+                one_hot_label = functional.one_hot(x=label, num_classes=4)
+
+            self.assertRaises(TypeError, test_bad_x)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ebe769fb9bce1aee8412ccebc216c2c85e97775
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.static import Program, program_guard
+
+
+def calc_margin_rank_loss(x, y, label, margin=0.0, reduction='none'):
+    result = (-1 * label) * (x - y) + margin
+    result = np.maximum(result, 0)
+    if reduction == 'none':
+        return result
+    elif reduction == 'sum':
+        return np.sum(result)
+    elif reduction == 'mean':
+        return np.mean(result)
+
+
+def create_test_case(margin, reduction):
+    class MarginRankingLossCls(unittest.TestCase):
+        def setUp(self):
+            self.x_data = np.random.rand(10, 10).astype("float64")
+            self.y_data = np.random.rand(10, 10).astype("float64")
+            self.label_data = np.random.choice(
+                [-1, 1], size=[10, 10]).astype("float64")
+            self.places = []
+            self.places.append(fluid.CPUPlace())
+            if core.is_compiled_with_cuda():
+                self.places.append(paddle.CUDAPlace(0))
+
+        def run_static_functional_api(self, place):
+            paddle.enable_static()
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(
+                    name="x", shape=[10, 10], dtype="float64")
+                y = paddle.static.data(
+                    name="y", shape=[10, 10], dtype="float64")
+                label = paddle.static.data(
+                    name="label", shape=[10, 10], dtype="float64")
+                result = paddle.nn.functional.margin_ranking_loss(
+                    x, y, label, margin, reduction)
+                exe = paddle.static.Executor(place)
+                result_numpy, = exe.run(feed={
+                    "x": self.x_data,
+                    "y": self.y_data,
+                    "label": self.label_data
+                },
+                                        fetch_list=[result])
+                self.assertTrue(np.allclose(result_numpy, expected))
+
+        def run_static_api(self, place):
+            paddle.enable_static()
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(
+                    name="x", shape=[10, 10], dtype="float64")
+                y = paddle.static.data(
+                    name="y", shape=[10, 10], dtype="float64")
+                label = paddle.static.data(
+                    name="label", shape=[10, 10], dtype="float64")
+                margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                    margin=margin, reduction=reduction)
+                result = margin_rank_loss(x, y, label)
+                exe = paddle.static.Executor(place)
+                result_numpy, = exe.run(feed={
+                    "x": self.x_data,
+                    "y": self.y_data,
+                    "label": self.label_data
+                },
+                                        fetch_list=[result])
+                self.assertTrue(np.allclose(result_numpy, expected))
+                self.assertTrue('loss' in result.name)
+
+        def run_dynamic_functional_api(self, place):
+            paddle.disable_static(place)
+            x = paddle.to_variable(self.x_data)
+            y = paddle.to_variable(self.y_data)
+            label = paddle.to_variable(self.label_data)
+
+            result = paddle.nn.functional.margin_ranking_loss(x, y, label,
+                                                              margin, reduction)
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            self.assertTrue(np.allclose(result.numpy(), expected))
+
+        def run_dynamic_api(self, place):
+            paddle.disable_static(place)
+            x = paddle.to_variable(self.x_data)
+            y = paddle.to_variable(self.y_data)
+            label = paddle.to_variable(self.label_data)
+            margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                margin=margin, reduction=reduction)
+            result = margin_rank_loss(x, y, label)
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                self.label_data,
+                margin=margin,
+                reduction=reduction)
+            self.assertTrue(np.allclose(result.numpy(), expected))
+
+        def run_dynamic_broadcast_api(self, place):
+            paddle.disable_static(place)
+            label_data = np.random.choice([-1, 1], size=[10]).astype("float64")
+            x = paddle.to_variable(self.x_data)
+            y = paddle.to_variable(self.y_data)
+            label = paddle.to_variable(label_data)
+            margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                margin=margin, reduction=reduction)
+            result = margin_rank_loss(x, y, label)
+            expected = calc_margin_rank_loss(
+                self.x_data,
+                self.y_data,
+                label_data,
+                margin=margin,
+                reduction=reduction)
+            self.assertTrue(np.allclose(result.numpy(), expected))
+
+        def test_case(self):
+            for place in self.places:
+                self.run_static_api(place)
+                self.run_static_functional_api(place)
+                self.run_dynamic_api(place)
+                self.run_dynamic_functional_api(place)
+                self.run_dynamic_broadcast_api(place)
+
+    cls_name = "TestMarginRankLossCase_{}_{}".format(margin, reduction)
+    MarginRankingLossCls.__name__ = cls_name
+    globals()[cls_name] = MarginRankingLossCls
+
+
+for margin in [0.0, 0.2]:
+    for reduction in ['none', 'mean', 'sum']:
+        create_test_case(margin, reduction)
+
+
+# test case the raise message
+class MarginRakingLossError(unittest.TestCase):
+    paddle.enable_static()
+
+    def test_errors(self):
+        def test_margin_value_error():
+            margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
+                margin=0.1, reduction="reduce_mean")
+
+        self.assertRaises(ValueError, test_margin_value_error)
+
+        def test_functional_margin_value_error():
+            x = paddle.static.data(name="x", shape=[10, 10], dtype="float64")
+            y = paddle.static.data(name="y", shape=[10, 10], dtype="float64")
+            label = paddle.static.data(
+                name="label", shape=[10, 10], dtype="float64")
+            result = paddle.nn.functional.margin_ranking_loss(
+                x, y, label, margin=0.1, reduction="reduction_mean")
+
+        self.assertRaises(ValueError, test_functional_margin_value_error)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d52a1f5d5b16ca7e0d58230a1a17624e5bff0b02
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from scipy.special import expit, erf
+import paddle
+import paddle.fluid as fluid
+import paddle.nn as nn
+import paddle.nn.functional as functional
+
+
+class TestNNSigmoidAPI(unittest.TestCase):
+    def setUp(self):
+        self.init_data()
+
+    def init_data(self):
+        self.x_shape = [10, 15]
+        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        self.y = self.ref_forward(self.x)
+
+    def ref_forward(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def ref_backward(self, y, dy):
+        return dy * y * (1 - y)
+
+    def check_static_api(self, place):
+        paddle.enable_static()
+        main_program = paddle.static.Program()
+        mysigmoid = nn.Sigmoid(name="api_sigmoid")
+        with paddle.static.program_guard(main_program):
+            x = paddle.static.data(name='x', shape=self.x_shape)
+            x.stop_gradient = False
+            y = mysigmoid(x)
+            fluid.backward.append_backward(paddle.mean(y))
+        exe = paddle.static.Executor(place)
+        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+        self.assertTrue(np.allclose(out[0], self.y))
+        self.assertTrue(y.name.startswith("api_sigmoid"))
+
+    def check_dynamic_api(self, place):
+        paddle.disable_static(place)
+        x = paddle.to_variable(self.x)
+        mysigmoid = nn.Sigmoid()
+        y = mysigmoid(x)
+        self.assertTrue(np.allclose(y.numpy(), self.y))
+
+    def test_check_api(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_dynamic_api(place)
+            self.check_static_api(place)
+
+
+class TestNNFunctionalSigmoidAPI(unittest.TestCase):
+    def setUp(self):
+        self.init_data()
+
+    def init_data(self):
+        self.x_shape = [10, 15]
+        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
+        self.y = self.ref_forward(self.x)
+
+    def ref_forward(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def check_static_api(self, place):
+        paddle.enable_static()
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program):
+            x = paddle.static.data(name='x', shape=self.x_shape)
+            y = functional.sigmoid(x, name="api_sigmoid")
+        exe = paddle.static.Executor(fluid.CPUPlace())
+        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
+        self.assertTrue(np.allclose(out[0], self.y))
+
+    def check_dynamic_api(self):
+        paddle.disable_static()
+        x = paddle.to_variable(self.x)
+        y = functional.sigmoid(x)
+        self.assertTrue(np.allclose(y.numpy(), self.y))
+
+    def test_check_api(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_static_api(place)
+            self.check_dynamic_api()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index e6b7a3e7603f53d78052d5de309d6ed7d84c4660..0d083038c6131215dcb572eca1e782c43e82d20a 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -23,16 +23,16 @@ import paddle.fluid as fluid
 
 def p_norm(x, axis, porder, keepdims=False):
     if axis is None: axis = -1
-    xp = np.power(np.abs(x), porder)
-    s = np.sum(xp, axis=axis, keepdims=keepdims)
-    r = np.power(s, 1.0 / porder)
+    r = np.linalg.norm(
+        x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype)
     return r
 
 
 def frobenius_norm(x, axis=None, keepdims=False):
     if isinstance(axis, list): axis = tuple(axis)
     if axis is None: axis = (-2, -1)
-    r = np.linalg.norm(x, ord='fro', axis=axis, keepdims=keepdims)
+    r = np.linalg.norm(
+        x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype)
     return r
 
 
@@ -89,6 +89,7 @@ class TestPnormOp(OpTest):
             'porder': float(self.porder)
         }
         self.outputs = {'Out': norm}
+        self.gradient = self.calc_gradient()
 
     def test_check_output(self):
         self.check_output()
@@ -104,6 +105,34 @@ class TestPnormOp(OpTest):
         self.keepdim = False
         self.dtype = "float64"
 
+    def calc_gradient(self):
+        self.attrs = {
+            'epsilon': self.epsilon,
+            'axis': self.axis,
+            'keepdim': self.keepdim,
+            'porder': float(self.porder)
+        }
+        x = self.inputs["X"]
+        porder = self.attrs["porder"]
+        axis = self.attrs["axis"]
+        if porder == 0:
+            grad = np.zeros(x.shape).astype(x.dtype)
+        elif porder in [float("inf"), float("-inf")]:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            x_abs = np.abs(x)
+            grad = np.sign(x)
+            grad[x_abs != norm] = 0.0
+        else:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            grad = np.power(norm, 1 - porder) * np.power(
+                np.abs(x), porder - 1) * np.sign(x)
+
+        numel = 1
+        for s in x.shape:
+            numel *= s
+        numel /= x.shape[axis]
+        return [grad.astype(x.dtype) * 1 / numel]
+
 
 class TestPnormOp2(TestPnormOp):
     def init_test_case(self):
@@ -118,6 +147,45 @@ class TestPnormOp2(TestPnormOp):
         self.check_grad(['X'], 'Out')
 
 
+class TestPnormOp3(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = np.inf
+        self.keepdim = True
+        self.dtype = "float32"
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+class TestPnormOp4(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = -np.inf
+        self.keepdim = True
+        self.dtype = "float32"
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+class TestPnormOp5(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = 0
+        self.keepdim = True
+        self.dtype = "float32"
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
 def run_out(self, p, axis, shape_x, shape_y, dtype):
     with fluid.program_guard(fluid.Program()):
         data1 = fluid.data(name="X", shape=shape_x, dtype=dtype)
@@ -170,6 +238,9 @@ class API_NormTest(unittest.TestCase):
         run_fro(self, p='fro', axis=[0, 1], shape_x=[3, 3, 4], dtype="float64")
         run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32")
         run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=np.inf, axis=1, shape_x=[3, 4], dtype="float32")
+        run_pnorm(self, p=-np.inf, axis=1, shape_x=[3, 4], dtype="float64")
+        run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64")
 
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9d9af4d50be77bd1d2ecc11dd872ef612209f1e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_normal.py
@@ -0,0 +1,197 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle
+import copy
+
+np.random.seed(10)
+
+
+class TestNormalAPI(unittest.TestCase):
+    def setUp(self):
+        self.mean = 1.0
+        self.std = 0.0
+        self.shape = None
+        self.repeat_num = 1000
+        self.set_attrs()
+        self.dtype = self.get_dtype()
+        self.place=paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def set_attrs(self):
+        self.shape = [8, 12]
+
+    def get_shape(self):
+        if isinstance(self.mean, np.ndarray):
+            shape = self.mean.shape
+        elif isinstance(self.std, np.ndarray):
+            shape = self.std.shape
+        else:
+            shape = self.shape
+        return list(shape)
+
+    def get_dtype(self):
+        if isinstance(self.mean, np.ndarray):
+            return self.mean.dtype
+        elif isinstance(self.std, np.ndarray):
+            return self.std.dtype
+        else:
+            return 'float32'
+
+    def static_api(self):
+        shape = self.get_shape()
+        ret_all_shape = copy.deepcopy(shape)
+        ret_all_shape.insert(0, self.repeat_num)
+        ret_all = np.zeros(ret_all_shape, self.dtype)
+        if isinstance(self.mean, np.ndarray) \
+            and isinstance(self.std, np.ndarray):
+            with paddle.static.program_guard(paddle.static.Program()):
+                mean = paddle.data('Mean', self.mean.shape, self.mean.dtype)
+                std = paddle.data('Std', self.std.shape, self.std.dtype)
+                out = paddle.normal(mean, std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(feed={
+                        'Mean': self.mean,
+                        'Std': self.std.reshape(shape)
+                    },
+                                  fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+        elif isinstance(self.mean, np.ndarray):
+            with paddle.static.program_guard(paddle.static.Program()):
+                mean = paddle.data('Mean', self.mean.shape, self.mean.dtype)
+                out = paddle.normal(mean, self.std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(feed={'Mean': self.mean}, fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+        elif isinstance(self.std, np.ndarray):
+            with paddle.static.program_guard(paddle.static.Program()):
+                std = paddle.data('Std', self.std.shape, self.std.dtype)
+                out = paddle.normal(self.mean, std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(feed={'Std': self.std}, fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+        else:
+            with paddle.static.program_guard(paddle.static.Program()):
+                out = paddle.normal(self.mean, self.std, self.shape)
+
+                exe = paddle.static.Executor(self.place)
+                for i in range(self.repeat_num):
+                    ret = exe.run(fetch_list=[out])
+                    ret_all[i] = ret[0]
+            return ret_all
+
+    def dygraph_api(self):
+        paddle.disable_static(self.place)
+        shape = self.get_shape()
+        ret_all_shape = copy.deepcopy(shape)
+        ret_all_shape.insert(0, self.repeat_num)
+        ret_all = np.zeros(ret_all_shape, self.dtype)
+
+        mean = paddle.to_tensor(self.mean) \
+            if isinstance(self.mean, np.ndarray) else self.mean
+        std = paddle.to_tensor(self.std) \
+            if isinstance(self.std, np.ndarray) else self.std
+        for i in range(self.repeat_num):
+            out = paddle.normal(mean, std, self.shape)
+            ret_all[i] = out.numpy()
+        paddle.enable_static()
+        return ret_all
+
+    def test_api(self):
+        ret_static = self.static_api()
+        ret_dygraph = self.dygraph_api()
+        for ret in [ret_static, ret_dygraph]:
+            shape_ref = self.get_shape()
+            self.assertEqual(shape_ref, list(ret[0].shape))
+
+            ret = ret.flatten().reshape([self.repeat_num, -1])
+            mean = np.mean(ret, axis=0)
+            std = np.std(ret, axis=0)
+            mean_ref=self.mean.reshape([1, -1]) \
+                if isinstance(self.mean, np.ndarray) else self.mean
+            std_ref=self.std.reshape([1, -1]) \
+                if isinstance(self.std, np.ndarray) else self.std
+            self.assertTrue(np.allclose(mean_ref, mean, 0.1, 0.1))
+            self.assertTrue(np.allclose(std_ref, std, 0.1, 0.1))
+
+
+class TestNormalAPI_mean_is_tensor(TestNormalAPI):
+    def set_attrs(self):
+        self.mean = np.random.uniform(-2, -1, [2, 3, 4, 5]).astype('float64')
+
+
+class TestNormalAPI_std_is_tensor(TestNormalAPI):
+    def set_attrs(self):
+        self.std = np.random.uniform(0.7, 1, [2, 3, 17]).astype('float64')
+
+
+class TestNormalAPI_mean_std_are_tensor(TestNormalAPI):
+    def set_attrs(self):
+        self.mean = np.random.uniform(1, 2, [1, 100]).astype('float64')
+        self.std = np.random.uniform(0.5, 1, [1, 100]).astype('float64')
+
+
+class TestNormalAPI_mean_std_are_tensor_with_different_dtype(TestNormalAPI):
+    def set_attrs(self):
+        self.mean = np.random.uniform(1, 2, [100]).astype('float64')
+        self.std = np.random.uniform(1, 2, [100]).astype('float32')
+
+
+class TestNormalAlias(unittest.TestCase):
+    def test_alias(self):
+        paddle.disable_static()
+        shape = [1, 2, 3]
+        out1 = paddle.normal(shape=shape)
+        out2 = paddle.tensor.normal(shape=shape)
+        out3 = paddle.tensor.random.normal(shape=shape)
+        paddle.enable_static()
+
+
+class TestNormalErrors(unittest.TestCase):
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            mean = [1, 2, 3]
+            self.assertRaises(TypeError, paddle.normal, mean)
+
+            std = [1, 2, 3]
+            self.assertRaises(TypeError, paddle.normal, std=std)
+
+            mean = paddle.data('Mean', [100], 'int32')
+            self.assertRaises(TypeError, paddle.normal, mean)
+
+            std = paddle.data('Std', [100], 'int32')
+            self.assertRaises(TypeError, paddle.normal, mean=1.0, std=std)
+
+            self.assertRaises(TypeError, paddle.normal, shape=1)
+
+            self.assertRaises(TypeError, paddle.normal, shape=[1.0])
+
+            shape = paddle.data('Shape', [100], 'float32')
+            self.assertRaises(TypeError, paddle.normal, shape=shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..6595a29b24ae23c9b38538035c9593ba77eecdb7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_normalize.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+
+
+def p_normalize(x, axis=1, p=2, epsilon=1e-12, keepdims=True):
+    if len(x.shape) == 1:
+        axis = 0
+    xp = np.power(np.abs(x), p)
+    s = np.sum(xp, axis=axis, keepdims=keepdims)
+    r = np.maximum(np.power(s, 1.0 / p), epsilon)
+    return x / r
+
+
+class TestNNFunctionalNormalize(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10)).astype(np.float32)
+        self.input_np2 = np.array([0.0, 0.0]).astype(np.float32)
+        self.expected0 = p_normalize(self.input_np)
+        self.expected1 = p_normalize(self.input_np, p=1.5)
+        self.expected2 = p_normalize(self.input_np, axis=0)
+        self.expected3 = p_normalize(self.input_np2)
+
+    def run_imperative(self):
+        x = paddle.to_variable(self.input_np)
+        y = F.normalize(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected0))
+
+        y = F.normalize(x, p=1.5)
+        self.assertTrue(np.allclose(y.numpy(), self.expected1))
+
+        y = F.normalize(x, axis=0)
+        self.assertTrue(np.allclose(y.numpy(), self.expected2))
+
+        x = paddle.to_variable(self.input_np2)
+        y = F.normalize(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected3))
+
+    def run_static(self, use_gpu=False):
+        x = paddle.data(name='input', shape=[10, 10], dtype='float32')
+        x2 = paddle.data(name='input2', shape=[2], dtype='float32')
+        result0 = F.normalize(x)
+        result1 = F.normalize(x, p=1.5)
+        result2 = F.normalize(x, axis=0)
+        result3 = F.normalize(x, name='aaa')
+        result4 = F.normalize(x2)
+
+        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        static_result = exe.run(
+            feed={"input": self.input_np,
+                  "input2": self.input_np2},
+            fetch_list=[result0, result1, result2, result4])
+
+        self.assertTrue(np.allclose(static_result[0], self.expected0))
+        self.assertTrue(np.allclose(static_result[1], self.expected1))
+        self.assertTrue(np.allclose(static_result[2], self.expected2))
+        self.assertTrue('aaa' in result3.name)
+        self.assertTrue(np.allclose(static_result[3], self.expected3))
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.fluid.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.fluid.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with fluid.program_guard(fluid.Program()):
+            self.run_static(use_gpu=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8512bc99e7451c73e5513b834fb6aa448717c646
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import functools
+import paddle
+
+
+class TestNumelOp(OpTest):
+    def setUp(self):
+        self.op_type = "size"
+        self.init()
+        x = np.random.random((self.shape)).astype("float64")
+        self.inputs = {'Input': x, }
+        self.outputs = {'Out': np.array([np.size(x)])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init(self):
+        self.shape = (6, 56, 8, 55)
+
+
+class TestNumelOp1(TestNumelOp):
+    def init(self):
+        self.shape = (11, 66)
+
+
+class TestNumelOp2(TestNumelOp):
+    def init(self):
+        self.shape = (0, )
+
+
+class TestNumelOoAPI(unittest.TestCase):
+    def test_numel_static(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            shape1 = [2, 1, 4, 5]
+            shape2 = [1, 4, 5]
+            x_1 = paddle.data(shape=shape1, dtype='int32', name='x_1')
+            x_2 = paddle.data(shape=shape2, dtype='int32', name='x_2')
+            input_1 = np.random.random(shape1).astype("int32")
+            input_2 = np.random.random(shape2).astype("int32")
+            out_1 = paddle.numel(x_1)
+            out_2 = paddle.numel(x_2)
+            exe = paddle.static.Executor(place=paddle.CPUPlace())
+            res_1, res_2 = exe.run(feed={
+                "x_1": input_1,
+                "x_2": input_2,
+            },
+                                   fetch_list=[out_1, out_2])
+            assert (np.array_equal(
+                res_1, np.array([np.size(input_1)]).astype("int64")))
+            assert (np.array_equal(
+                res_2, np.array([np.size(input_2)]).astype("int64")))
+
+    def test_numel_imperative(self):
+        paddle.disable_static(paddle.CPUPlace())
+        input_1 = np.random.random([2, 1, 4, 5]).astype("int32")
+        input_2 = np.random.random([1, 4, 5]).astype("int32")
+        x_1 = paddle.to_variable(input_1)
+        x_2 = paddle.to_variable(input_2)
+        out_1 = paddle.numel(x_1)
+        out_2 = paddle.numel(x_2)
+        assert (np.array_equal(out_1.numpy().item(0), np.size(input_1)))
+        assert (np.array_equal(out_2.numpy().item(0), np.size(input_2)))
+        paddle.enable_static()
+
+    def test_error(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+
+            def test_x_type():
+                shape = [1, 4, 5]
+                input_1 = np.random.random(shape).astype("int32")
+                out_1 = paddle.numel(input_1)
+
+            self.assertRaises(TypeError, test_x_type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py
index 4e3b3f3edc9f92a2b268586f79dbcc3aafc05031..c1e6a3377710f98184e9541e287b911def89cd81 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_like.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_like.py
@@ -62,18 +62,18 @@ class TestOnesLikeImpeartive(unittest.TestCase):
         shape = [3, 4]
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        with paddle.imperative.guard(place):
-            x = paddle.imperative.to_variable(np.ones(shape))
-            for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
-                out = ones_like(x, dtype)
-                self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(),
-                                 True)
-
-            out = paddle.tensor.ones_like(x)
+        paddle.disable_static(place)
+        x = paddle.to_variable(np.ones(shape))
+        for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
+            out = ones_like(x, dtype)
             self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
 
-            out = paddle.tensor.creation.ones_like(x)
-            self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
+        out = paddle.tensor.ones_like(x)
+        self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
+
+        out = paddle.tensor.creation.ones_like(x)
+        self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_ones_op.py b/python/paddle/fluid/tests/unittests/test_ones_op.py
index d50e820c6c6bc89a9346382c79f057e179f1da12..47ce37964324208a032c821360d6ab10666abcb5 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_op.py
@@ -27,35 +27,35 @@ import numpy as np
 
 class ApiOnesTest(unittest.TestCase):
     def test_paddle_ones(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10])
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="float32")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10], dtype="float64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="float64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
 
     def test_fluid_ones(self):
-        with paddle.program_guard(paddle.Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             ones = fluid.layers.ones(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[ones])
             expected_result = np.ones(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -64,25 +64,25 @@ class ApiOnesTest(unittest.TestCase):
 class ApiOnesZerosError(unittest.TestCase):
     def test_errors(self):
         def test_error1():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = paddle.ones(shape=10, dtype="int64")
 
         self.assertRaises(TypeError, test_error1)
 
         def test_error2():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = paddle.ones(shape=10)
 
         self.assertRaises(TypeError, test_error2)
 
         def test_error3():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = fluid.layers.ones(shape=10, dtype="int64")
 
         self.assertRaises(TypeError, test_error3)
 
         def test_error4():
-            with paddle.program_guard(paddle.Program()):
+            with paddle.static.program_guard(paddle.static.Program()):
                 ones = fluid.layers.ones(shape=[10], dtype="int8")
 
         self.assertRaises(TypeError, test_error4)
diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..68589e6d8182f9e6dfdabfc7bce4c20bec521740
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
@@ -0,0 +1,670 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.fluid.core as core
+
+from paddle.fluid import Program, program_guard, Executor, default_main_program
+
+
+class TestPad3dOp(OpTest):
+    def setUp(self):
+        paddle.enable_static()
+        self.value = 0.0
+        self.variable_paddings = False
+        self.initTestCase()
+        self.op_type = "pad3d"
+        self.inputs = {'X': np.random.random(self.shape).astype("float64")}
+        self.attrs = {}
+        if self.variable_paddings:
+            self.attrs['paddings'] = []
+            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
+                "int32")
+        else:
+            self.attrs['paddings'] = np.array(self.paddings).flatten().astype(
+                "int32")
+        self.attrs['value'] = self.value
+        self.attrs['mode'] = self.mode
+        self.attrs['data_format'] = self.data_format
+        if self.data_format == "NCDHW":
+            paddings = [
+                (0, 0),
+                (0, 0),
+                (self.paddings[4], self.paddings[5]),
+                (self.paddings[2], self.paddings[3]),
+                (self.paddings[0], self.paddings[1]),
+            ]
+        else:
+            paddings = [
+                (0, 0),
+                (self.paddings[4], self.paddings[5]),
+                (self.paddings[2], self.paddings[3]),
+                (self.paddings[0], self.paddings[1]),
+                (0, 0),
+            ]
+        if self.mode == "constant":
+            out = np.pad(self.inputs['X'],
+                         paddings,
+                         mode=self.mode,
+                         constant_values=self.value)
+        elif self.mode == "reflect":
+            out = np.pad(self.inputs['X'], paddings, mode=self.mode)
+        elif self.mode == "replicate":
+            out = np.pad(self.inputs['X'], paddings, mode="edge")
+        elif self.mode == "circular":
+            out = np.pad(self.inputs['X'], paddings, mode="wrap")
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out')
+
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 0, 0, 0, 0, 0]
+        self.mode = "constant"
+        self.data_format = "NCDHW"
+        self.pad_value = 0.0
+
+
+class TestCase1(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 2, 3, 4, 5]
+        self.mode = "constant"
+        self.data_format = "NCDHW"
+        self.value = 1.0
+
+
+class TestCase2(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [1, 1, 1, 1, 1, 1]
+        self.mode = "constant"
+        self.data_format = "NDHWC"
+        self.value = 1.0
+
+
+class TestCase3(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 1, 0, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NCDHW"
+
+
+class TestCase4(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (4, 4, 4, 4, 4)
+        self.paddings = [0, 1, 2, 1, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NDHWC"
+
+
+class TestCase5(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 2, 3, 2, 1]
+        self.mode = "replicate"
+        self.data_format = "NCDHW"
+
+
+class TestCase6(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (4, 4, 4, 4, 4)
+        self.paddings = [5, 4, 2, 1, 2, 3]
+        self.mode = "replicate"
+        self.data_format = "NDHWC"
+
+
+class TestCase7(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.paddings = [0, 1, 2, 3, 2, 1]
+        self.mode = "circular"
+        self.data_format = "NCDHW"
+
+
+class TestCase8(TestPad3dOp):
+    def initTestCase(self):
+        self.shape = (4, 4, 4, 4, 4)
+        self.paddings = [0, 1, 2, 1, 2, 3]
+        self.mode = "circular"
+        self.data_format = "NDHWC"
+
+
+class TestPadAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def check_static_result_1(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (1, 2, 3, 4, 5)
+            pad = [1, 2, 1, 1, 3, 4]
+            mode = "constant"
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result = F.pad(x=x, pad=pad, value=value, mode=mode)
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result])
+
+            np_out = self._get_numpy_out(input_data, pad, mode, value)
+            self.assertTrue(np.allclose(fetches[0], np_out))
+
+    def check_static_result_2(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (2, 3, 4, 5, 6)
+            pad = [1, 2, 1, 1, 1, 2]
+            mode = "reflect"
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW")
+            result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result1, result2])
+
+            np_out1 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NCDHW")
+            np_out2 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NDHWC")
+            self.assertTrue(np.allclose(fetches[0], np_out1))
+            self.assertTrue(np.allclose(fetches[1], np_out2))
+
+    def check_static_result_3(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (2, 3, 4, 5, 6)
+            pad = [1, 2, 1, 1, 3, 4]
+            mode = "replicate"
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW")
+            result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result1, result2])
+
+            np_out1 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NCDHW")
+            np_out2 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NDHWC")
+            self.assertTrue(np.allclose(fetches[0], np_out1))
+            self.assertTrue(np.allclose(fetches[1], np_out2))
+
+    def check_static_result_4(self, place):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_shape = (2, 3, 4, 5, 6)
+            pad = [1, 2, 1, 1, 3, 4]
+            mode = "circular"
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW")
+            result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC")
+            exe = Executor(place)
+            fetches = exe.run(default_main_program(),
+                              feed={"x": input_data},
+                              fetch_list=[result1, result2])
+
+            np_out1 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NCDHW")
+            np_out2 = self._get_numpy_out(
+                input_data, pad, mode, data_format="NDHWC")
+            self.assertTrue(np.allclose(fetches[0], np_out1))
+            self.assertTrue(np.allclose(fetches[1], np_out2))
+
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0,
+                       data_format="NCDHW"):
+        if data_format == "NCDHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        elif data_format == "NDHWC":
+            pad = [
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+        elif data_format == "NCHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        elif data_format == "NHWC":
+            pad = [
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+        elif data_format == "NCL":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[0], pad[1]),
+            ]
+        elif data_format == "NLC":
+            pad = [
+                (0, 0),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+        elif mode == "circular":
+            out = np.pad(input_data, pad, mode="wrap")
+
+        return out
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result_1(place=place)
+            self.check_static_result_2(place=place)
+            self.check_static_result_3(place=place)
+            self.check_static_result_4(place=place)
+
+    def test_dygraph_1(self):
+        paddle.disable_static()
+
+        input_shape = (1, 2, 3, 4, 5)
+        pad = [1, 2, 1, 1, 3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCDHW")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NDHWC")
+        tensor_data = paddle.to_tensor(input_data)
+
+        y1 = F.pad(tensor_data,
+                   pad=pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCDHW")
+        y2 = F.pad(tensor_data,
+                   pad=pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NDHWC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+    def test_dygraph_2(self):
+        paddle.disable_static()
+
+        input_shape = (2, 3, 4, 5)
+        pad = [1, 1, 3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCHW")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NHWC")
+
+        tensor_data = paddle.to_tensor(input_data)
+        tensor_pad = paddle.to_tensor(pad, dtype="int32")
+
+        y1 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCHW")
+        y2 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NHWC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+    def test_dygraph_2(self):
+        paddle.disable_static()
+
+        input_shape = (2, 3, 4, 5)
+        pad = [1, 1, 3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCHW")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NHWC")
+        tensor_data = paddle.to_tensor(input_data)
+        tensor_pad = paddle.to_tensor(pad, dtype="int32")
+
+        y1 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCHW")
+        y2 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NHWC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+    def test_dygraph_3(self):
+        paddle.disable_static()
+
+        input_shape = (3, 4, 5)
+        pad = [3, 4]
+        mode = "constant"
+        value = 100
+        input_data = np.random.rand(*input_shape).astype(np.float32)
+        np_out1 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NCL")
+        np_out2 = self._get_numpy_out(
+            input_data, pad, mode, value, data_format="NLC")
+        tensor_data = paddle.to_tensor(input_data)
+        tensor_pad = paddle.to_tensor(pad, dtype="int32")
+
+        y1 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NCL")
+        y2 = F.pad(tensor_data,
+                   pad=tensor_pad,
+                   mode=mode,
+                   value=value,
+                   data_format="NLC")
+
+        self.assertTrue(np.allclose(y1.numpy(), np_out1))
+        self.assertTrue(np.allclose(y2.numpy(), np_out2))
+
+
+class TestPad1dAPI(unittest.TestCase):
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0.0,
+                       data_format="NCL"):
+        if data_format == "NCL":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[0], pad[1]),
+            ]
+        else:
+            pad = [
+                (0, 0),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+
+        return out
+
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_class(self):
+        paddle.disable_static()
+        for place in self.places:
+            input_shape = (3, 4, 5)
+            pad = [1, 2]
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+
+            pad_reflection = nn.ReflectionPad1d(padding=pad)
+            pad_replication = nn.ReplicationPad1d(padding=pad)
+            pad_constant = nn.ConstantPad1d(padding=pad, value=value)
+
+            data = paddle.to_tensor(input_data)
+
+            output = pad_reflection(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "reflect", data_format="NCL")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_replication(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "replicate", data_format="NCL")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_constant(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=value, data_format="NCL")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+
+class TestPad2dAPI(unittest.TestCase):
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0.0,
+                       data_format="NCHW"):
+        if data_format == "NCHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        else:
+            pad = [
+                (0, 0),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+
+        return out
+
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_class(self):
+        paddle.disable_static()
+        for place in self.places:
+            input_shape = (3, 4, 5, 6)
+            pad = [1, 2, 2, 1]
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+
+            pad_reflection = nn.ReflectionPad2d(padding=pad)
+            pad_replication = nn.ReplicationPad2d(padding=pad)
+            pad_constant = nn.ConstantPad2d(padding=pad, value=value)
+            pad_zero = nn.ZeroPad2d(padding=pad)
+
+            data = paddle.to_tensor(input_data)
+
+            output = pad_reflection(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "reflect", data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_replication(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "replicate", data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_constant(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=value, data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_zero(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=0, data_format="NCHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+
+class TestPad3dAPI(unittest.TestCase):
+    def _get_numpy_out(self,
+                       input_data,
+                       pad,
+                       mode,
+                       value=0.0,
+                       data_format="NCDHW"):
+        if data_format == "NCDHW":
+            pad = [
+                (0, 0),
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+            ]
+        else:
+            pad = [
+                (0, 0),
+                (pad[4], pad[5]),
+                (pad[2], pad[3]),
+                (pad[0], pad[1]),
+                (0, 0),
+            ]
+
+        if mode == "constant":
+            out = np.pad(input_data, pad, mode=mode, constant_values=value)
+        elif mode == "reflect":
+            out = np.pad(input_data, pad, mode=mode)
+        elif mode == "replicate":
+            out = np.pad(input_data, pad, mode="edge")
+
+        return out
+
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_class(self):
+        paddle.disable_static()
+        for place in self.places:
+            input_shape = (3, 4, 5, 6, 7)
+            pad = [1, 2, 2, 1, 1, 0]
+            value = 100
+            input_data = np.random.rand(*input_shape).astype(np.float32)
+
+            pad_replication = nn.ReplicationPad3d(padding=pad)
+            pad_constant = nn.ConstantPad3d(padding=pad, value=value)
+
+            data = paddle.to_tensor(input_data)
+
+            output = pad_replication(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "replicate", data_format="NCDHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+            output = pad_constant(data)
+            np_out = self._get_numpy_out(
+                input_data, pad, "constant", value=value, data_format="NCDHW")
+            self.assertTrue(np.allclose(output.numpy(), np_out))
+
+
+class TestPad3dOpError(unittest.TestCase):
+    def test_errors(self):
+        def test_variable():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            F.pad(x=data, paddings=[1, 1, 1, 1, 1, 1])
+
+        def test_reflect_1():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            y = F.pad(x, pad=[5, 6, 1, 1, 1, 1], value=1, mode='reflect')
+            place = paddle.CPUPlace()
+            exe = Executor(place)
+            outputs = exe.run(feed={'x': data}, fetch_list=[y.name])
+
+        def test_reflect_2():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            y = F.pad(x, pad=[1, 1, 4, 3, 1, 1], value=1, mode='reflect')
+            place = paddle.CPUPlace()
+            exe = Executor(place)
+            outputs = exe.run(feed={'x': data}, fetch_list=[y.name])
+
+        def test_reflect_3():
+            input_shape = (1, 2, 3, 4, 5)
+            data = np.random.rand(*input_shape).astype(np.float32)
+            x = paddle.data(name="x", shape=input_shape)
+            y = F.pad(x, pad=[1, 1, 1, 1, 2, 3], value=1, mode='reflect')
+            place = paddle.CPUPlace()
+            exe = Executor(place)
+            outputs = exe.run(feed={'x': data}, fetch_list=[y.name])
+
+        self.assertRaises(TypeError, test_variable)
+
+        self.assertRaises(Exception, test_reflect_1)
+
+        self.assertRaises(Exception, test_reflect_2)
+
+        self.assertRaises(Exception, test_reflect_3)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
index 50e587478957a9e5c359d0c8a9d606859f17e994..858d56c1fc04f61c9dd281a633f7be9aceff8338 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
@@ -22,7 +22,7 @@ import paddle
 
 def _dygraph_guard_(func):
     def __impl__(*args, **kwargs):
-        if paddle.in_imperative_mode():
+        if paddle.in_dynamic_mode():
             return func(*args, **kwargs)
         else:
             with fluid.dygraph.guard():
@@ -54,7 +54,7 @@ class TestDygraphDoubleGrad(TestCase):
              allow_unused=False):
         backward_strategy = fluid.dygraph.BackwardStrategy()
         backward_strategy.sort_sum_gradient = self.sort_sum_gradient
-        return paddle.imperative.grad(
+        return paddle.grad(
             outputs=outputs,
             inputs=inputs,
             grad_outputs=grad_outputs,
diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf0efa6ec2e7edafb8d331423a7b47155283c21
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+
+
+def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False):
+    return np.linalg.norm(x - y, ord=p, axis=1, keepdims=keepdim)
+
+
+def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+
+    place = fluid.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda(
+    ) else fluid.CPUPlace()
+
+    with paddle.static.program_guard(prog, startup_prog):
+        x = paddle.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+        y = paddle.data(name='y', shape=y_np.shape, dtype=x_np.dtype)
+        dist = paddle.nn.layer.distance.PairwiseDistance(
+            p=p, epsilon=epsilon, keepdim=keepdim)
+        distance = dist(x, y)
+        exe = paddle.static.Executor(place)
+        static_ret = exe.run(prog,
+                             feed={'x': x_np,
+                                   'y': y_np},
+                             fetch_list=[distance])
+        static_ret = static_ret[0]
+    return static_ret
+
+
+def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
+    paddle.disable_static()
+    x = paddle.to_variable(x_np)
+    y = paddle.to_variable(y_np)
+    dist = paddle.nn.layer.distance.PairwiseDistance(
+        p=p, epsilon=epsilon, keepdim=keepdim)
+    distance = dist(x, y)
+    dygraph_ret = distance.numpy()
+    paddle.enable_static()
+    return dygraph_ret
+
+
+class TestPairwiseDistance(unittest.TestCase):
+    def test_pairwise_distance(self):
+        all_shape = [[100, 100], [4, 5, 6, 7]]
+        dtypes = ['float32', 'float64']
+        keeps = [False, True]
+        for shape in all_shape:
+            for dtype in dtypes:
+                for keepdim in keeps:
+                    x_np = np.random.random(shape).astype(dtype)
+                    y_np = np.random.random(shape).astype(dtype)
+
+                    static_ret = test_static(x_np, y_np, keepdim=keepdim)
+                    dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim)
+                    excepted_value = pairwise_distance(
+                        x_np, y_np, keepdim=keepdim)
+
+                    self.assertTrue(np.allclose(static_ret, dygraph_ret))
+                    self.assertTrue(np.allclose(static_ret, excepted_value))
+                    self.assertTrue(np.allclose(dygraph_ret, excepted_value))
+
+    def test_pairwise_distance_broadcast(self):
+        shape_x = [100, 100]
+        shape_y = [100, 1]
+        keepdim = False
+        x_np = np.random.random(shape_x).astype('float32')
+        y_np = np.random.random(shape_y).astype('float32')
+        static_ret = test_static(x_np, y_np, keepdim=keepdim)
+        dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim)
+        excepted_value = pairwise_distance(x_np, y_np, keepdim=keepdim)
+        self.assertTrue(np.allclose(static_ret, dygraph_ret))
+        self.assertTrue(np.allclose(static_ret, excepted_value))
+        self.assertTrue(np.allclose(dygraph_ret, excepted_value))
+
+    def test_pairwise_distance_different_p(self):
+        shape = [100, 100]
+        keepdim = False
+        p = 3.0
+        x_np = np.random.random(shape).astype('float32')
+        y_np = np.random.random(shape).astype('float32')
+        static_ret = test_static(x_np, y_np, p=p, keepdim=keepdim)
+        dygraph_ret = test_dygraph(x_np, y_np, p=p, keepdim=keepdim)
+        excepted_value = pairwise_distance(x_np, y_np, p=p, keepdim=keepdim)
+        self.assertTrue(np.allclose(static_ret, dygraph_ret))
+        self.assertTrue(np.allclose(static_ret, excepted_value))
+        self.assertTrue(np.allclose(dygraph_ret, excepted_value))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cf1e9711b74b31e15b732f87addbc9fa653152f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import paddle.fluid as fluid
+
+import os
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestParallelDygraphMnist(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._nccl2_mode = True
+        self._dygraph = True
+
+    def test_mnist(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "parallel_dygraph_sync_batch_norm.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py
index fe31add697c65671eec12e8727499513129b1f05..dd1cf29eff9b7545121ac37908c4045dc924ceb0 100644
--- a/python/paddle/fluid/tests/unittests/test_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/test_pipeline.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
index 0bcb4be3b7fb9380932cf137ac8e4939dcd77288..cf93f39ab8c5c92aa075f2f0a7dca9a5c5d9f485 100644
--- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
+++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
@@ -16,16 +16,17 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+
 from op_test import OpTest
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid.core as core
+import paddle.fluid as fluid
 
 
-class TestPixelShuffle(OpTest):
-    def setUp(self):
-        self.op_type = "pixel_shuffle"
-        n, c, h, w = 2, 9, 4, 4
-        up_factor = 3
-        shape = [n, c, h, w]
-        x = np.random.random(shape).astype("float64")
+def pixel_shuffle_np(x, up_factor, data_format="NCHW"):
+    if data_format == "NCHW":
+        n, c, h, w = x.shape
         new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h,
                      w)
         # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w)
@@ -34,10 +35,42 @@ class TestPixelShuffle(OpTest):
         npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
         oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor]
         npresult = np.reshape(npresult, oshape)
+        return npresult
+    else:
+        n, h, w, c = x.shape
+        new_shape = (n, h, w, c // (up_factor * up_factor), up_factor,
+                     up_factor)
+        # reshape to (num,h,w,output_channel,upscale_factor,upscale_factor)
+        npresult = np.reshape(x, new_shape)
+        # transpose to (num,h,upscale_factor,w,upscale_factor,output_channel)
+        npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
+        oshape = [n, h * up_factor, w * up_factor, c // (up_factor * up_factor)]
+        npresult = np.reshape(npresult, oshape)
+        return npresult
+
+
+class TestPixelShuffleOp(OpTest):
+    def setUp(self):
+        self.op_type = "pixel_shuffle"
+        self.init_data_format()
+        n, c, h, w = 2, 9, 4, 4
+
+        if self.format == "NCHW":
+            shape = [n, c, h, w]
+        if self.format == "NHWC":
+            shape = [n, h, w, c]
+
+        up_factor = 3
+
+        x = np.random.random(shape).astype("float64")
+        npresult = pixel_shuffle_np(x, up_factor, self.format)
 
         self.inputs = {'X': x}
         self.outputs = {'Out': npresult}
-        self.attrs = {'upscale_factor': up_factor}
+        self.attrs = {'upscale_factor': up_factor, "data_format": self.format}
+
+    def init_data_format(self):
+        self.format = "NCHW"
 
     def test_check_output(self):
         self.check_output()
@@ -46,5 +79,141 @@ class TestPixelShuffle(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestChannelLast(TestPixelShuffleOp):
+    def init_data_format(self):
+        self.format = "NHWC"
+
+
+class TestPixelShuffleAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
+        self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
+        self.out_1_np = pixel_shuffle_np(self.x_1_np, 3)
+        self.out_2_np = pixel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+    def test_static_graph_functional(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.enable_static()
+            x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64")
+            x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            out_1 = F.pixel_shuffle(x_1, 3)
+            out_2 = F.pixel_shuffle(x_2, 3, "NHWC")
+
+            exe = paddle.static.Executor(place=place)
+            res_1 = exe.run(fluid.default_main_program(),
+                            feed={"x": self.x_1_np},
+                            fetch_list=out_1,
+                            use_prune=True)
+
+            res_2 = exe.run(fluid.default_main_program(),
+                            feed={"x2": self.x_2_np},
+                            fetch_list=out_2,
+                            use_prune=True)
+
+            assert np.allclose(res_1, self.out_1_np)
+            assert np.allclose(res_2, self.out_2_np)
+
+    # same test between layer and functional in this op.
+    def test_static_graph_layer(self):
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.enable_static()
+            x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64")
+            x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            # init instance
+            ps_1 = paddle.nn.PixelShuffle(3)
+            ps_2 = paddle.nn.PixelShuffle(3, "NHWC")
+            out_1 = ps_1(x_1)
+            out_2 = ps_2(x_2)
+            out_1_np = pixel_shuffle_np(self.x_1_np, 3)
+            out_2_np = pixel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+            exe = paddle.static.Executor(place=place)
+            res_1 = exe.run(fluid.default_main_program(),
+                            feed={"x": self.x_1_np},
+                            fetch_list=out_1,
+                            use_prune=True)
+
+            res_2 = exe.run(fluid.default_main_program(),
+                            feed={"x2": self.x_2_np},
+                            fetch_list=out_2,
+                            use_prune=True)
+
+            assert np.allclose(res_1, out_1_np)
+            assert np.allclose(res_2, out_2_np)
+
+    def run_dygraph(self, up_factor, data_format):
+
+        n, c, h, w = 2, 9, 4, 4
+
+        if data_format == "NCHW":
+            shape = [n, c, h, w]
+        if data_format == "NHWC":
+            shape = [n, h, w, c]
+
+        x = np.random.random(shape).astype("float64")
+
+        npresult = pixel_shuffle_np(x, up_factor, data_format)
+
+        for use_cuda in ([False, True]
+                         if core.is_compiled_with_cuda() else [False]):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.disable_static(place=place)
+
+            pixel_shuffle = paddle.nn.PixelShuffle(
+                up_factor, data_format=data_format)
+            result = pixel_shuffle(paddle.to_tensor(x))
+
+            self.assertTrue(np.allclose(result.numpy(), npresult))
+
+            result_functional = F.pixel_shuffle(
+                paddle.to_tensor(x), 3, data_format)
+            self.assertTrue(np.allclose(result_functional.numpy(), npresult))
+
+    def test_dygraph1(self):
+        self.run_dygraph(3, "NCHW")
+
+    def test_dygraph2(self):
+        self.run_dygraph(3, "NHWC")
+
+
+class TestPixelShuffleError(unittest.TestCase):
+    def test_error_functional(self):
+        def error_upscale_factor():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                pixel_shuffle = F.pixel_shuffle(paddle.to_tensor(x), 3.33)
+
+        self.assertRaises(TypeError, error_upscale_factor)
+
+        def error_data_format():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                pixel_shuffle = F.pixel_shuffle(paddle.to_tensor(x), 3, "WOW")
+
+        self.assertRaises(ValueError, error_data_format)
+
+    def test_error_layer(self):
+        def error_upscale_factor_layer():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                ps = paddle.nn.PixelShuffle(3.33)
+
+        self.assertRaises(TypeError, error_upscale_factor_layer)
+
+        def error_data_format_layer():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                ps = paddle.nn.PixelShuffle(3, "MEOW")
+
+        self.assertRaises(ValueError, error_data_format_layer)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1a25ad3529e8b0a4126bc458838ecd876e5af30
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
@@ -0,0 +1,373 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+import paddle.fluid as fluid
+
+
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def max_pool1D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False,
+                             exclusive=False,
+                             adaptive=False,
+                             data_type=np.float64):
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            r_start = adaptive_start_index(i, L, ksize[0])
+            r_end = adaptive_end_index(i, L, ksize[0])
+        else:
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        x_masked = x[:, :, r_start:r_end]
+
+        out[:, :, i] = np.max(x_masked, axis=(2))
+    return out
+
+
+def avg_pool1D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False,
+                             exclusive=False,
+                             adaptive=False,
+                             data_type=np.float64):
+    N, C, L = x.shape
+    if global_pool == 1:
+        ksize = [L]
+    if adaptive:
+        L_out = ksize[0]
+    else:
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
+                 ) // strides[0] + 1 if ceil_mode else (
+                     L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+
+    out = np.zeros((N, C, L_out))
+    for i in range(L_out):
+        if adaptive:
+            r_start = adaptive_start_index(i, L, ksize[0])
+            r_end = adaptive_end_index(i, L, ksize[0])
+        else:
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        x_masked = x[:, :, r_start:r_end]
+
+        field_size = (r_end - r_start) \
+            if (exclusive or adaptive) else (ksize[0])
+        if data_type == np.int8 or data_type == np.uint8:
+            out[:, :, i] = (np.rint(
+                np.sum(x_masked, axis=(2, 3)) / field_size)).astype(data_type)
+        else:
+            out[:, :, i] = (np.sum(x_masked, axis=(2)) /
+                            field_size).astype(data_type)
+    return out
+
+
+class TestPool1d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0], ceil_mode=False)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=[0])
+
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool1d_dg = paddle.nn.layer.AvgPool1d(
+                kernel_size=2, stride=None, padding=0)
+            result = avg_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.max_pool1d(input, kernel_size=2, stride=2, padding=[0])
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.max_pool1d(input, kernel_size=2, stride=2, padding=0)
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool1d_dg = paddle.nn.layer.MaxPool1d(
+                kernel_size=2, stride=None, padding=0)
+            result = max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.adaptive_max_pool1d(input, output_size=16)
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(
+                output_size=16)
+            result = ada_max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.adaptive_avg_pool1d(input, output_size=16)
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
+                output_size=16)
+            result = ada_max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_adaptive_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.adaptive_max_pool1d(input, output_size=16)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_adaptive_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32")
+            result = F.adaptive_avg_pool1d(input, output_size=16)
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_padding_same(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.max_pool1d(
+                input, kernel_size=2, stride=2, padding="SAME")
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_dygraph_padding_same(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = F.avg_pool1d(
+                input, kernel_size=2, stride=2, padding="SAME")
+
+            result_np = avg_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def test_pool1d(self):
+        for place in self.places:
+
+            self.check_max_dygraph_results(place)
+            self.check_avg_dygraph_results(place)
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_adaptive_max_dygraph_results(place)
+            self.check_adaptive_avg_dygraph_results(place)
+            self.check_adaptive_max_static_results(place)
+            self.check_adaptive_avg_static_results(place)
+            self.check_max_dygraph_padding_same(place)
+            self.check_avg_dygraph_padding_same(place)
+
+
+class TestPool2dError_API(unittest.TestCase):
+    def test_error_api(self):
+        def run1():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[2]]
+                res_pd = F.max_pool1d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run1)
+
+        def run2():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[2]]
+                res_pd = F.max_pool1d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run2)
+
+        def run3():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = F.max_pool1d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run3)
+
+        def run4():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = F.max_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run4)
+
+        def run5():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = F.max_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run5)
+
+        def run6():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = F.avg_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run6)
+
+        def run7():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "paddle"
+                res_pd = F.avg_pool1d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True)
+
+        self.assertRaises(ValueError, run7)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..73df0885d8fed4ddc4c03c91d2c331e72772e398
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
@@ -0,0 +1,375 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive
+import unittest
+from op_test import OpTest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.nn.functional import *
+import paddle.fluid as fluid
+import paddle
+
+
+class TestPool2d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32], dtype="float32")
+            result = avg_pool2d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = avg_pool2d(input, kernel_size=2, stride=2, padding=0)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32], dtype="float32")
+            result = max_pool2d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = max_pool2d(
+                input, kernel_size=2, stride=2, padding=0, return_indices=False)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool2d_dg = paddle.nn.layer.MaxPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_stride_is_none(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result, indices = max_pool2d(
+                input,
+                kernel_size=2,
+                stride=None,
+                padding="SAME",
+                return_indices=True)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max',
+                padding_algorithm="SAME")
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool2d_dg = paddle.nn.layer.MaxPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_dygraph_stride_is_none(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = avg_pool2d(
+                input, kernel_size=2, stride=None, padding="SAME")
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg',
+                padding_algorithm="SAME")
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_padding(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = [[0, 0], [0, 0], [0, 0], [0, 0]]
+            result = max_pool2d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                return_indices=False)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='max')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool2d_dg = paddle.nn.layer.MaxPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_divisor(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = [[0, 0], [0, 0], [0, 0], [0, 0]]
+            result = avg_pool2d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                divisor_override=4)
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                strides=[2, 2],
+                paddings=[0, 0],
+                pool_type='avg')
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool2d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def test_pool2d(self):
+        for place in self.places:
+
+            self.check_max_dygraph_results(place)
+            self.check_avg_dygraph_results(place)
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_max_dygraph_stride_is_none(place)
+            self.check_avg_dygraph_stride_is_none(place)
+            self.check_max_dygraph_padding(place)
+            self.check_avg_divisor(place)
+
+
+class TestPool2dError_API(unittest.TestCase):
+    def test_error_api(self):
+        def run1():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0]]
+                res_pd = max_pool2d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run1)
+
+        def run2():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0]]
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run2)
+
+        def run3():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run3)
+
+        def run3_avg():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run3_avg)
+
+        def run4():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run4)
+
+        def run4_avg():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run4_avg)
+
+        def run5():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "padding"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run5)
+
+        def run6():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=True,
+                    data_format='NHWC')
+
+        self.assertRaises(ValueError, run6)
+
+        def run7():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = avg_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=False,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run7)
+
+        def run8():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(-1, 1,
+                                             [2, 3, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = "VALID"
+                res_pd = max_pool2d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    ceil_mode=False,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run8)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index a9fdcd55f74cd53824016765fe82a03190f23f89..a12a328b653b26dac31b11839d02e867a012c4bc 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -517,6 +517,19 @@ class TestAvgPoolAdaptive(TestCase1):
         self.adaptive = True
 
 
+class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [8, 3, 6, 6]
+
+    def init_test_case(self):
+        self.ksize = [2, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0, 0, 0]
+
+
 #-------test pool2d with asymmetric padding-----
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc078e9aae7aafe55e937b80270dd012fd64ff70
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
@@ -0,0 +1,341 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.nn.functional import avg_pool3d, max_pool3d
+from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive
+
+
+class TestPool3d_API(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_avg_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32, 32], dtype="float32")
+            result = avg_pool3d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='avg')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_avg_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = avg_pool3d(input, kernel_size=2, stride=2, padding="SAME")
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='avg',
+                padding_algorithm="SAME")
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3d(
+                kernel_size=2, stride=None, padding="SAME")
+            result = avg_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_static_results(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(
+                name="input", shape=[2, 3, 32, 32, 32], dtype="float32")
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=0)
+
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max')
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input": input_np},
+                              fetch_list=[result])
+            self.assertTrue(np.allclose(fetches[0], result_np))
+
+    def check_max_dygraph_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=0)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max')
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            max_pool3d_dg = paddle.nn.layer.MaxPool3d(
+                kernel_size=2, stride=None, padding=0)
+            result = max_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_stride_is_none(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result, indices = max_pool3d(
+                input,
+                kernel_size=2,
+                stride=None,
+                padding="SAME",
+                return_indices=True)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max',
+                padding_algorithm="SAME")
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            max_pool3d_dg = paddle.nn.layer.MaxPool3d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_max_dygraph_padding(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=padding)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='max')
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            max_pool3d_dg = paddle.nn.layer.MaxPool3d(
+                kernel_size=2, stride=2, padding=0)
+            result = max_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            padding = [0, 0, 0, 0, 0, 0]
+            result = max_pool3d(input, kernel_size=2, stride=2, padding=padding)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def check_avg_divisor(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            padding = 0
+            result = avg_pool3d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                divisor_override=8)
+
+            result_np = pool3D_forward_naive(
+                input_np,
+                ksize=[2, 2, 2],
+                strides=[2, 2, 2],
+                paddings=[0, 0, 0],
+                pool_type='avg')
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3d(
+                kernel_size=2, stride=2, padding=0)
+            result = avg_pool3d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            padding = [0, 0, 0, 0, 0, 0]
+            result = avg_pool3d(
+                input,
+                kernel_size=2,
+                stride=2,
+                padding=padding,
+                divisor_override=8)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+    def test_pool3d(self):
+        for place in self.places:
+
+            self.check_max_dygraph_results(place)
+            self.check_avg_dygraph_results(place)
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_max_dygraph_stride_is_none(place)
+            self.check_max_dygraph_padding(place)
+            self.check_avg_divisor(place)
+
+
+class TestPool3dError_API(unittest.TestCase):
+    def test_error_api(self):
+        def run1():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
+                res_pd = avg_pool3d(
+                    input_pd, kernel_size=2, stride=2, padding=padding)
+
+        self.assertRaises(ValueError, run1)
+
+        def run2():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NCDHW')
+
+        self.assertRaises(ValueError, run2)
+
+        def run3():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=padding,
+                    data_format='NDHWC')
+
+        self.assertRaises(ValueError, run3)
+
+        def run4():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=0,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run4)
+
+        def run5():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = max_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding=0,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run5)
+
+        def run6():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="padding",
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run6)
+
+        def run7():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = max_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="padding",
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run7)
+
+        def run8():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = avg_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="VALID",
+                    ceil_mode=True,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run8)
+
+        def run9():
+            with fluid.dygraph.guard():
+                input_np = np.random.uniform(
+                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_pd = fluid.dygraph.to_variable(input_np)
+                res_pd = max_pool3d(
+                    input_pd,
+                    kernel_size=2,
+                    stride=2,
+                    padding="VALID",
+                    ceil_mode=True,
+                    data_format='NNNN')
+
+        self.assertRaises(ValueError, run9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index ade7e9f50fd27a3bd4084a628eff445e0d81db0d..3d139e9b90c10e352599d506fb3ca837472348f5 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -453,6 +453,18 @@ class TestAvgPoolAdaptive(TestCase1):
         self.adaptive = True
 
 
+class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [8, 3, 2, 4, 4]
+
+    def init_test_case(self):
+        self.ksize = [2, 2, 3]
+        self.strides = [1, 1, 1]
+
+
 #-------test pool3d with asymmetric padding------
 class TestPool3d_Op_AsyPadding(TestPool3d_Op):
     def init_test_case(self):
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index 398ad9aa698e5e7cb4102a72955c2d33e6a7e7a9..16388ff8f5f042326ac5705a5f69919f4f8061c2 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -18,23 +18,134 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
 import six
-import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
 from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.nn.functional as F
+
+
+def ref_prelu(x, weight):
+    x_t = x.copy()
+    weight = weight.reshape(1, -1, 1, 1)
+    neg_indices = x <= 0
+    assert x.shape == neg_indices.shape
+    x_t[neg_indices] = (x_t * weight)[neg_indices]
+    return (x_t, )
+
+
+def ref_prelu_nn(x, num_parameters, init):
+    weight_np = np.full((num_parameters), init)
+    return ref_prelu(x, weight_np)
 
 
-class TestPReluOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program()):
+class TestFunctionalPReluAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float32')
+        self.weight_np_0 = np.random.randn(1).astype('float32')
+        self.weight_np_1 = np.random.randn(self.x_np.shape[1]).astype('float32')
+
+    def static_check(self, weight_np):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, 'float32')
+            weight = paddle.data('Alpha', weight_np.shape, 'float32')
+            out = F.prelu(x, weight)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np,
+                                'Alpha': weight_np},
+                          fetch_list=[out])
+        out_ref = ref_prelu(self.x_np, weight_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def dygraph_check(self, weight_np):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        weight = paddle.to_tensor(weight_np)
+        out = F.prelu(x, weight)
+        out_ref = ref_prelu(self.x_np, weight_np)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+        paddle.enable_static()
+
+    def test_static_api(self):
+        self.static_check(self.weight_np_0)
+        self.static_check(self.weight_np_1)
+
+    def test_dygraph_api(self):
+        self.dygraph_check(self.weight_np_0)
+        self.dygraph_check(self.weight_np_1)
+
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            weight_fp32 = paddle.data(
+                name='weight_fp32', shape=[1], dtype='float32')
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.prelu, 0.1, 'all')
+            self.assertRaises(TypeError, F.prelu, x=1, weight=weight_fp32)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.prelu, x_int32, 'all')
-            # support the input dtype is float32
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float32')
-            fluid.layers.prelu(x_fp16, 'all')
+            x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32')
+            self.assertRaises(TypeError, F.prelu, x=x_int32, weight=weight_fp32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16')
+            F.prelu(x=x_fp16, weight=weight_fp32)
+
+
+class TestNNPReluAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.ones([1, 2, 3, 4]).astype('float32')
+
+    def test_static_api(self):
+        startup_program = paddle.static.Program()
+        train_program = paddle.static.Program()
+        with paddle.static.program_guard(train_program, startup_program):
+            x = paddle.data(name='X', shape=self.x_np.shape, dtype='float32')
+            m = paddle.nn.PReLU()
+            out = m(x)
+            exe = paddle.static.Executor(self.place)
+            exe.run(startup_program)
+            res = exe.run(train_program,
+                          feed={'X': self.x_np},
+                          fetch_list=[out])
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.25)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU()
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.25)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(num_parameters=self.x_np.shape[1])
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, self.x_np.shape[1], 0.25)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(init=0.5)
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.5)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(weight_attr=fluid.ParamAttr(name="weight"))
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.25)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        x = paddle.to_tensor(self.x_np)
+        m = paddle.nn.PReLU(weight_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(0.5)))
+        out = m(x)
+        out_ref = ref_prelu_nn(self.x_np, 1, 0.5)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        paddle.enable_static()
 
 
 class PReluTest(OpTest):
@@ -51,21 +162,22 @@ class PReluTest(OpTest):
         if self.attrs == {'mode': "all"}:
             alpha_np = np.random.uniform(-1, -0.5, (1))
         elif self.attrs == {'mode': "channel"}:
-            alpha_np = np.random.uniform(-1, -0.5, [1, self.x_shape[1]])
+            alpha_np = np.random.uniform(-1, -0.5, [1, self.x_shape[1], 1, 1])
         else:
             alpha_np = np.random.uniform(-1, -0.5, [1] + self.x_shape[1:])
 
         self.inputs = {'X': x_np, 'Alpha': alpha_np}
 
-        # NOTE(zhiqu): reshape inputs['Alpha'] from [1, 100] to [1, 100, 1, 1] since np operands could not be broadcast together with shapes (2,100,3,4) (1,100) 
+        # NOTE(zhiqu): reshape inputs['Alpha'] from [1, 100, 1, 1] to [1, 100] + [1]*len(x.shape[2:])
+        # since np operands could not be broadcast together with shapes (1,100,2,2,2,3) (1,100,1,1) 	
+        reshaped_alpha = self.inputs['Alpha']
         if self.attrs == {'mode': "channel"}:
-            self.inputs['Alpha'] = np.reshape(
+            reshaped_alpha = np.reshape(
                 self.inputs['Alpha'],
                 [1, self.x_shape[1]] + [1] * len(self.x_shape[2:]))
 
         out_np = np.maximum(self.inputs['X'], 0.)
-        out_np = out_np + np.minimum(self.inputs['X'],
-                                     0.) * self.inputs['Alpha']
+        out_np = out_np + np.minimum(self.inputs['X'], 0.) * reshaped_alpha
         assert out_np is not self.inputs['X']
         self.outputs = {'Out': out_np}
 
diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..158683907253e2ebc5adab6799c75ffd914df1c7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_prod_op.py
@@ -0,0 +1,132 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import unittest
+import numpy as np
+
+
+class TestProdOp(unittest.TestCase):
+    def setUp(self):
+        self.input = np.random.random(size=(10, 10, 5)).astype(np.float32)
+
+    def run_imperative(self):
+        input = paddle.to_tensor(self.input)
+        dy_result = paddle.prod(input)
+        expected_result = np.prod(self.input)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1)
+        expected_result = np.prod(self.input, axis=1)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=-1)
+        expected_result = np.prod(self.input, axis=-1)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=[0, 1])
+        expected_result = np.prod(self.input, axis=(0, 1))
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1, keepdim=True)
+        expected_result = np.prod(self.input, axis=1, keepdims=True)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1, dtype='int64')
+        expected_result = np.prod(self.input, axis=1, dtype=np.int64)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+        dy_result = paddle.prod(input, axis=1, keepdim=True, dtype='int64')
+        expected_result = np.prod(
+            self.input, axis=1, keepdims=True, dtype=np.int64)
+        self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
+
+    def run_static(self, use_gpu=False):
+        input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32')
+        result0 = paddle.prod(input)
+        result1 = paddle.prod(input, axis=1)
+        result2 = paddle.prod(input, axis=-1)
+        result3 = paddle.prod(input, axis=[0, 1])
+        result4 = paddle.prod(input, axis=1, keepdim=True)
+        result5 = paddle.prod(input, axis=1, dtype='int64')
+        result6 = paddle.prod(input, axis=1, keepdim=True, dtype='int64')
+
+        place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        exe.run(paddle.static.default_startup_program())
+        static_result = exe.run(feed={"input": self.input},
+                                fetch_list=[
+                                    result0, result1, result2, result3, result4,
+                                    result5, result6
+                                ])
+
+        expected_result = np.prod(self.input)
+        self.assertTrue(np.allclose(static_result[0], expected_result))
+        expected_result = np.prod(self.input, axis=1)
+        self.assertTrue(np.allclose(static_result[1], expected_result))
+        expected_result = np.prod(self.input, axis=-1)
+        self.assertTrue(np.allclose(static_result[2], expected_result))
+        expected_result = np.prod(self.input, axis=(0, 1))
+        self.assertTrue(np.allclose(static_result[3], expected_result))
+        expected_result = np.prod(self.input, axis=1, keepdims=True)
+        self.assertTrue(np.allclose(static_result[4], expected_result))
+        expected_result = np.prod(self.input, axis=1, dtype=np.int64)
+        self.assertTrue(np.allclose(static_result[5], expected_result))
+        expected_result = np.prod(
+            self.input, axis=1, keepdims=True, dtype=np.int64)
+        self.assertTrue(np.allclose(static_result[6], expected_result))
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not paddle.fluid.core.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            self.run_static(use_gpu=True)
+
+
+class TestProdOpError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.data(name='x', shape=[2, 2, 4], dtype='float32')
+            bool_x = paddle.data(name='bool_x', shape=[2, 2, 4], dtype='bool')
+            # The argument x shoule be a Tensor
+            self.assertRaises(TypeError, paddle.prod, [1])
+
+            # The data type of x should be float32, float64, int32, int64
+            self.assertRaises(TypeError, paddle.prod, bool_x)
+
+            # The argument axis's type shoule be int ,list or tuple
+            self.assertRaises(TypeError, paddle.prod, x, 1.5)
+
+            # The argument dtype of prod_op should be float32, float64, int32 or int64.
+            self.assertRaises(TypeError, paddle.prod, x, 'bool')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_query_op.py b/python/paddle/fluid/tests/unittests/test_query_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc8ce5ad5f6b89b28fb2ddddd15d5b315fe4c0e4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_query_op.py
@@ -0,0 +1,32 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+from paddle.fluid import core
+
+
+class TestCudnnVersion(unittest.TestCase):
+    def test_no_cudnn(self):
+        cudnn_version = paddle.get_cudnn_version()
+        if not core.is_compiled_with_cuda():
+            self.assertEqual((cudnn_version is None), True)
+        else:
+            self.assertEqual((isinstance(cudnn_version, int)), True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py
index 5b2d5be346a9b205cb44373f58a413baa6c8a2fa..88b07f5df83f8f967f8ba76e78b37ecfb2c54276 100644
--- a/python/paddle/fluid/tests/unittests/test_randint_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_op.py
@@ -19,7 +19,7 @@ import numpy as np
 from op_test import OpTest
 import paddle
 from paddle.fluid import core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 
 
 def output_hist(out):
@@ -125,14 +125,14 @@ class TestRandintAPI(unittest.TestCase):
             out4 = paddle.randint(
                 low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32')
             # shape is a tensor and dtype is 'float64'
-            var_shape = paddle.nn.data(
+            var_shape = paddle.static.data(
                 name='var_shape', shape=[2], dtype="int64")
             out5 = paddle.randint(
                 low=1, high=1000, shape=var_shape, dtype='int64')
 
             place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             outs = exe.run(
                 feed={'var_shape': np.array([100, 100]).astype('int64')},
                 fetch_list=[out1, out2, out3, out4, out5])
@@ -141,13 +141,14 @@ class TestRandintAPI(unittest.TestCase):
 class TestRandintImperative(unittest.TestCase):
     def test_api(self):
         n = 10
-        with paddle.imperative.guard():
-            x1 = paddle.randint(n, shape=[10], dtype="int32")
-            x2 = paddle.tensor.randint(n)
-            x3 = paddle.tensor.random.randint(n)
-            for i in [x1, x2, x3]:
-                for j in i.numpy().tolist():
-                    self.assertTrue((j >= 0 and j < n))
+        paddle.disable_static()
+        x1 = paddle.randint(n, shape=[10], dtype="int32")
+        x2 = paddle.tensor.randint(n)
+        x3 = paddle.tensor.random.randint(n)
+        for i in [x1, x2, x3]:
+            for j in i.numpy().tolist():
+                self.assertTrue((j >= 0 and j < n))
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py
index f65cc6dc53b7e3541016447d8510bd3d38a53b17..9d2c03f3bba914d8f6b06b54ce0e19c168edb9e3 100644
--- a/python/paddle/fluid/tests/unittests/test_randn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randn_op.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid.core as core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 
 
 class TestRandnOp(unittest.TestCase):
@@ -34,12 +34,12 @@ class TestRandnOp(unittest.TestCase):
             dim_2 = paddle.fill_constant([1], "int32", 50)
             x3 = paddle.randn([dim_1, dim_2, 784])
 
-            var_shape = paddle.nn.data('X', [2], 'int32')
+            var_shape = paddle.static.data('X', [2], 'int32')
             x4 = paddle.randn(var_shape)
 
         place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
-        exe = paddle.Executor(place)
+        exe = paddle.static.Executor(place)
         res = exe.run(train_program,
                       feed={'X': np.array(
                           shape, dtype='int32')},
@@ -55,20 +55,21 @@ class TestRandnOpForDygraph(unittest.TestCase):
         shape = [1000, 784]
         place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
-        with paddle.imperative.guard(place):
-            x1 = paddle.randn(shape, 'float32')
-            x2 = paddle.randn(shape, 'float64')
+        paddle.disable_static(place)
+        x1 = paddle.randn(shape, 'float32')
+        x2 = paddle.randn(shape, 'float64')
 
-            dim_1 = paddle.fill_constant([1], "int64", 20)
-            dim_2 = paddle.fill_constant([1], "int32", 50)
-            x3 = paddle.randn(shape=[dim_1, dim_2, 784])
+        dim_1 = paddle.fill_constant([1], "int64", 20)
+        dim_2 = paddle.fill_constant([1], "int32", 50)
+        x3 = paddle.randn(shape=[dim_1, dim_2, 784])
 
-            var_shape = paddle.imperative.to_variable(np.array(shape))
-            x4 = paddle.randn(var_shape)
+        var_shape = paddle.to_variable(np.array(shape))
+        x4 = paddle.randn(var_shape)
 
-            for out in [x1, x2, x3, x4]:
-                self.assertAlmostEqual(np.mean(out.numpy()), .0, delta=0.1)
-                self.assertAlmostEqual(np.std(out.numpy()), 1., delta=0.1)
+        for out in [x1, x2, x3, x4]:
+            self.assertAlmostEqual(np.mean(out.numpy()), .0, delta=0.1)
+            self.assertAlmostEqual(np.std(out.numpy()), 1., delta=0.1)
+        paddle.enable_static()
 
 
 class TestRandnOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py
new file mode 100644
index 0000000000000000000000000000000000000000..2933abe46c1b87959c9f61975c02a41e91dfbef3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_random_seed.py
@@ -0,0 +1,463 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import unittest
+import paddle.fluid.generator as generator
+
+import time  # temp for debug
+import paddle.fluid as fluid
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+
+
+class TestGeneratorSeed(unittest.TestCase):
+    """
+    Test cases for cpu generator seed.
+    """
+
+    def test_generator_uniform_random_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0)
+        st1 = gen.get_state()
+        x1 = fluid.layers.uniform_random(
+            [10], dtype="float32", min=0.0, max=1.0)
+        gen.set_state(st1)
+        x2 = fluid.layers.uniform_random(
+            [10], dtype="float32", min=0.0, max=1.0)
+        gen.manual_seed(12312321111)
+        x3 = fluid.layers.uniform_random(
+            [10], dtype="float32", min=0.0, max=1.0)
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not core.is_compiled_with_cuda():
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_uniform_random_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = fluid.layers.uniform_random(shape=[3, 4])
+            result_2 = fluid.layers.uniform_random(shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_gen_dropout_dygraph(self):
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(111111111)
+        st = gen.get_state()
+        # x = np.arange(1,101).reshape(2,50).astype("float32")
+        x = fluid.layers.uniform_random(
+            [2, 10], dtype="float32", min=0.0, max=1.0)
+        y = fluid.layers.dropout(x, 0.5)
+        gen.manual_seed(111111111)
+        #gen.set_state(st)
+        x1 = fluid.layers.uniform_random(
+            [2, 10], dtype="float32", min=0.0, max=1.0)
+        y1 = fluid.layers.dropout(x1, 0.5)
+        y_np = y.numpy()
+        y1_np = y1.numpy()
+        #print(y_np)
+        #print(y1_np)
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> dropout dygraph >>>>>>>")
+            self.assertTrue(np.allclose(y_np, y1_np))
+
+    def test_gen_dropout_static(self):
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x_1 = fluid.layers.uniform_random(shape=[2, 10])
+            y_1 = fluid.layers.dropout(x_1, 0.5)
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program, feed={}, fetch_list=[y_1])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program, feed={}, fetch_list=[y_1])
+        out1_np = np.array(out1[0])
+        out2_np = np.array(out2[0])
+        # print(out1_np)
+        # print(out2_np)
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> dropout static >>>>>>>")
+            self.assertTrue(np.allclose(out1_np, out2_np))
+
+    def test_generator_gaussian_random_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = fluid.layers.gaussian_random([10], dtype="float32")
+        st1 = gen.get_state()
+        x1 = fluid.layers.gaussian_random([10], dtype="float32")
+        gen.set_state(st1)
+        x2 = fluid.layers.gaussian_random([10], dtype="float32")
+        gen.manual_seed(12312321111)
+        x3 = fluid.layers.gaussian_random([10], dtype="float32")
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> gaussian random dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_gaussian_random_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = fluid.layers.gaussian_random(shape=[3, 4])
+            result_2 = fluid.layers.gaussian_random(shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> gaussian random static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_randint_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = paddle.randint(low=10, shape=[10], dtype="int32")
+        st1 = gen.get_state()
+        x1 = paddle.randint(low=10, shape=[10], dtype="int32")
+        gen.set_state(st1)
+        x2 = paddle.randint(low=10, shape=[10], dtype="int32")
+        gen.manual_seed(12312321111)
+        x3 = paddle.randint(low=10, shape=[10], dtype="int32")
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> randint dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_ranint_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = paddle.randint(low=10, shape=[3, 4])
+            result_2 = paddle.randint(low=10, shape=[3, 4])
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> randint static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_randperm_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = paddle.randperm(10)
+        st1 = gen.get_state()
+        x1 = paddle.randperm(10)
+        gen.set_state(st1)
+        x2 = paddle.randperm(10)
+        gen.manual_seed(12312321111)
+        x3 = paddle.randperm(10)
+        x_np = x.numpy()
+        x1_np = x1.numpy()
+        x2_np = x2.numpy()
+        x3_np = x3.numpy()
+
+        # print("## {}".format(x1_np))
+        # print("## {}".format(x2_np))
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> randperm dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_randperm_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            result_1 = paddle.randperm(10)
+            result_2 = paddle.randperm(10)
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> randperm static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_generator_sampling_id_dygraph(self):
+        """Test Generator seed."""
+        gen = generator.Generator()
+
+        fluid.enable_dygraph()
+
+        gen.manual_seed(12312321111)
+        x = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y = fluid.layers.sampling_id(x)
+        st1 = gen.get_state()
+        x1 = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y1 = fluid.layers.sampling_id(x)
+        gen.set_state(st1)
+        x2 = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y2 = fluid.layers.sampling_id(x)
+        gen.manual_seed(12312321111)
+        x3 = fluid.layers.uniform_random(
+            [10, 10], dtype="float32", min=0.0, max=1.0)
+        y3 = fluid.layers.sampling_id(x)
+
+        x_np = y.numpy()
+        x1_np = y1.numpy()
+        x2_np = y2.numpy()
+        x3_np = y3.numpy()
+
+        print("## {}".format(x1_np))
+        print("## {}".format(x2_np))
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> sampling id dygraph >>>>>>>")
+            self.assertTrue(np.allclose(x1_np, x2_np))
+            self.assertTrue(np.allclose(x_np, x3_np))
+
+    def test_generator_randperm_static(self):
+
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x = fluid.layers.uniform_random(shape=[10, 10])
+            result_1 = fluid.layers.sampling_id(x)
+            result_2 = fluid.layers.sampling_id(x)
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+            gen.manual_seed(123123143)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+            out1_res1 = np.array(out1[0])
+            out1_res2 = np.array(out1[1])
+            out2_res1 = np.array(out2[0])
+            out2_res2 = np.array(out2[1])
+
+            if not core.is_compiled_with_cuda():
+                print(">>>>>>> sampling id static >>>>>>>")
+                self.assertTrue(np.allclose(out1_res1, out2_res1))
+                self.assertTrue(np.allclose(out1_res2, out2_res2))
+                self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+    def test_gen_TruncatedNormal_initializer(self):
+        fluid.disable_dygraph()
+
+        gen = generator.Generator()
+        gen.manual_seed(123123143)
+        cur_state = gen.get_state()
+
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x = fluid.layers.uniform_random(shape=[2, 10])
+            result_1 = fluid.layers.fc(
+                input=x,
+                size=10,
+                param_attr=fluid.initializer.TruncatedNormal(
+                    loc=0.0, scale=2.0))
+            result_2 = fluid.layers.fc(
+                input=x,
+                size=10,
+                param_attr=fluid.initializer.TruncatedNormal(
+                    loc=0.0, scale=2.0))
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+            #gen.set_state(cur_state)
+
+        #gen.set_state(cur_state)    
+        gen.manual_seed(123123143)
+        with fluid.program_guard(train_program, startup_program):
+            exe.run(startup_program)
+            out2 = exe.run(train_program,
+                           feed={},
+                           fetch_list=[result_1, result_2])
+
+        out1_res1 = np.array(out1[0])
+        out1_res2 = np.array(out1[1])
+        out2_res1 = np.array(out2[0])
+        out2_res2 = np.array(out2[1])
+
+        print(out1_res1)
+        print(out1_res2)
+        print(out2_res1)
+        print(out2_res2)
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> sampling id static >>>>>>>")
+            self.assertTrue(np.allclose(out1_res1, out2_res1))
+            self.assertTrue(np.allclose(out1_res2, out2_res2))
+            self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py
index 6938b8ef1e051777c867796062e5e7cbed6d7fa4..4361a45f1568f5f047ee03090bd3ef28a8d6654f 100644
--- a/python/paddle/fluid/tests/unittests/test_randperm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -17,7 +17,7 @@ import numpy as np
 from op_test import OpTest
 import paddle
 import paddle.fluid.core as core
-from paddle import Program, program_guard
+from paddle.static import program_guard, Program
 
 
 def check_randperm_out(n, data_np):
@@ -108,7 +108,7 @@ class TestRandpermAPI(unittest.TestCase):
             x1 = paddle.randperm(n)
             x2 = paddle.randperm(n, 'float32')
 
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             res = exe.run(fetch_list=[x1, x2])
 
             self.assertEqual(res[0].dtype, np.int64)
@@ -119,13 +119,14 @@ class TestRandpermAPI(unittest.TestCase):
 
 class TestRandpermImperative(unittest.TestCase):
     def test_out(self):
-        with paddle.imperative.guard():
-            n = 10
-            for dtype in ['int32', np.int64, 'float32', 'float64']:
-                data_p = paddle.randperm(n, dtype)
-                data_np = data_p.numpy()
-                self.assertTrue(
-                    check_randperm_out(n, data_np), msg=error_msg(data_np))
+        paddle.disable_static()
+        n = 10
+        for dtype in ['int32', np.int64, 'float32', 'float64']:
+            data_p = paddle.randperm(n, dtype)
+            data_np = data_p.numpy()
+            self.assertTrue(
+                check_randperm_out(n, data_np), msg=error_msg(data_np))
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 16874d80112bbd537718b20fd4dec3701ea3b75d..cf35f9dbcdaaae1357ccdfd6b5cba85ac98d2037 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -67,22 +67,6 @@ class TestSumOp6D(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestMeanOp(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
@@ -318,21 +302,6 @@ class TestReduceAll(Test1DReduce):
         self.outputs = {'Out': self.inputs['X'].sum()}
 
 
-## reduction in multi dims
-class TestReduceMeanOpMultiAxises(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': [1, 2]}
-        self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
@@ -420,40 +389,6 @@ class TestReduceSumWithNumelOne(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestReduceMeanWithDimOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((100, 1, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': False}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=False)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestReduceMeanWithNumelOne(OpTest):
-    def setUp(self):
-        self.op_type = "reduce_mean"
-        self.inputs = {'X': np.random.random((100, 1)).astype("float64")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=True)
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 class TestReduceAll(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
@@ -536,18 +471,6 @@ class TestReduceSumOpError(unittest.TestCase):
             self.assertRaises(TypeError, fluid.layers.reduce_sum, x2)
 
 
-class TestReduceMeanOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of reduce_mean_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.reduce_mean, x1)
-            # The input dtype of reduce_mean_op  must be float32 or float64 or int32 or int64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
-            self.assertRaises(TypeError, fluid.layers.reduce_mean, x2)
-
-
 class API_TestSumOpError(unittest.TestCase):
     def test_errors(self):
         def test_dtype1():
@@ -580,10 +503,10 @@ class API_TestSumOpError(unittest.TestCase):
 
 
 class API_TestSumOp(unittest.TestCase):
-    def test_1(self):
+    def test_static(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="float32")
-            result_sum = paddle.sum(input=data, dim=1, dtype="float64")
+            result_sum = paddle.sum(x=data, axis=1, dtype="float64")
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.rand(10, 10).astype(np.float32)
@@ -593,7 +516,7 @@ class API_TestSumOp(unittest.TestCase):
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="int32")
-            result_sum = paddle.sum(input=data, dim=1, dtype="int64")
+            result_sum = paddle.sum(x=data, axis=1, dtype="int64")
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.randint(10, size=(10, 10)).astype(np.int32)
@@ -603,7 +526,7 @@ class API_TestSumOp(unittest.TestCase):
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="int32")
-            result_sum = paddle.sum(input=data, dim=1)
+            result_sum = paddle.sum(x=data, axis=1)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.randint(10, size=(10, 10)).astype(np.int32)
@@ -612,84 +535,41 @@ class API_TestSumOp(unittest.TestCase):
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.data("data", shape=[10, 10], dtype="int32")
-            result_sum = paddle.sum(input=data, dim=1)
+            result_sum = paddle.sum(x=data, axis=1)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input_data = np.random.randint(10, size=(10, 10)).astype(np.int32)
             res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum])
         self.assertEqual((res == np.sum(input_data, axis=1)).all(), True)
 
-        with fluid.dygraph.guard():
-            np_x = np.array([10, 10]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            z = paddle.sum(x, dim=0)
-            np_z = z.numpy()
-            z_expected = np.array(np.sum(np_x, axis=0))
-        self.assertEqual((np_z == z_expected).all(), True)
-
-
-class API_TestMaxOp(unittest.TestCase):
-    def test_1(self):
-        # type: float
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="float32")
-            result_max = paddle.max(input=data, dim=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            input_data = np.random.rand(10, 10).astype(np.float32)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
-        self.assertEqual((res == np.max(input_data, axis=1)).all(), True)
-
-        # type: int
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="int64")
-            result_max = paddle.max(input=data, dim=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_max])
-        self.assertEqual((res == np.max(input_data, axis=1)).all(), True)
-
-        # dygraph
-        with fluid.dygraph.guard():
-            np_x = np.array([10, 10]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            z = paddle.max(x, dim=0)
-            np_z = z.numpy()
-            z_expected = np.array(np.max(np_x, axis=0))
-        self.assertEqual((np_z == z_expected).all(), True)
-
+            input_data = np.random.randint(10, size=(5, 5, 5)).astype(np.int32)
+            data = fluid.data("data", shape=[5, 5, 5], dtype="int32")
+            sum1 = paddle.sum(x=data, axis=[0, 1])
+            sum2 = paddle.sum(x=data, axis=())
 
-class API_TestMinOp(unittest.TestCase):
-    def test_1(self):
-        # type: float
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="float32")
-            result_min = paddle.min(input=data, dim=1)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
-            input_data = np.random.rand(10, 10).astype(np.float32)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
-        self.assertEqual((res == np.min(input_data, axis=1)).all(), True)
+            res1, res2 = exe.run(feed={"data": input_data},
+                                 fetch_list=[sum1, sum2])
 
-        # type: int
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data = fluid.data("data", shape=[10, 10], dtype="int64")
-            result_min = paddle.min(input=data, dim=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            input_data = np.random.randint(10, size=(10, 10)).astype(np.int64)
-            res, = exe.run(feed={"data": input_data}, fetch_list=[result_min])
-        self.assertEqual((res == np.min(input_data, axis=1)).all(), True)
+        self.assertEqual((res1 == np.sum(input_data, axis=(0, 1))).all(), True)
+        self.assertEqual(
+            (res2 == np.sum(input_data, axis=(0, 1, 2))).all(), True)
 
-        # dygraph
+    def test_dygraph(self):
+        np_x = np.random.random([2, 3, 4]).astype('int32')
         with fluid.dygraph.guard():
-            np_x = np.array([10, 10]).astype('float64')
             x = fluid.dygraph.to_variable(np_x)
-            z = paddle.min(x, dim=0)
-            np_z = z.numpy()
-            z_expected = np.array(np.min(np_x, axis=0))
-        self.assertEqual((np_z == z_expected).all(), True)
+            out0 = paddle.sum(x).numpy()
+            out1 = paddle.sum(x, axis=0).numpy()
+            out2 = paddle.sum(x, axis=(0, 1)).numpy()
+            out3 = paddle.sum(x, axis=(0, 1, 2)).numpy()
+
+        self.assertTrue((out0 == np.sum(np_x, axis=(0, 1, 2))).all())
+        self.assertTrue((out1 == np.sum(np_x, axis=0)).all())
+        self.assertTrue((out2 == np.sum(np_x, axis=(0, 1))).all())
+        self.assertTrue((out3 == np.sum(np_x, axis=(0, 1, 2))).all())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 3dfd9023f5af30ff289c4dc55a0c275402bc3067..275f9d21f9f8eca653a030bfe5c74071397f33c1 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
 
@@ -227,35 +228,43 @@ class TestReshapeUint8Op(TestReshapeInt8Op):
 
 # Test python API
 class TestReshapeAPI(unittest.TestCase):
-    # situation 1: have shape( list, no tensor), no actual shape(Tensor)
-    def test_1(self):
+    def _set_paddle_api(self):
+        self.fill_constant = paddle.fill_constant
+        self.data = paddle.data
+        self.reshape = paddle.reshape
+        self.to_tensor = paddle.to_tensor
+
+    def _set_fluid_api(self):
+        self.fill_constant = fluid.layers.fill_constant
+        self.data = fluid.data
+        self.reshape = fluid.layers.reshape
+
+    def _test_api(self):
         input = np.random.random([2, 25]).astype("float32")
         shape = [2, 5, 5]
-        positive_five = fluid.layers.fill_constant([1], "int32", 5)
-        x = fluid.layers.data(
-            name="x", shape=[2, 25], append_batch_size=False, dtype="float32")
+        main_prog = Program()
+        with program_guard(main_prog, Program()):
+            positive_five = self.fill_constant([1], "int32", 5)
+            x = self.data(name="x", shape=[2, 25], dtype="float32")
 
-        actual_shape = fluid.layers.data(
-            name="shape",
-            shape=[1, 3],
-            append_batch_size=False,
-            dtype="float32")
+            actual_shape = self.data(name="shape", shape=[3], dtype="int32")
 
-        # situation 1: have shape( list, no tensor), no actual shape(Tensor)
-        out_1 = fluid.layers.reshape(x, shape)
+            # situation 1: have shape( list, no tensor), no actual shape(Tensor)
+            out_1 = self.reshape(x, shape)
 
-        # situation 2: have shape(list, no tensor), have actual shape(Tensor)
-        out_2 = fluid.layers.reshape(x, shape=shape, actual_shape=actual_shape)
+            # situation 2: have shape(list, no tensor), have actual shape(Tensor)
+            out_2 = fluid.layers.reshape(
+                x, shape=shape, actual_shape=actual_shape)
 
-        # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
-        out_3 = fluid.layers.reshape(x, shape=[positive_five, 10])
+            # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
+            out_3 = self.reshape(x, shape=[positive_five, 10])
 
-        # Situation 4: have shape(Tensor), no actual shape(Tensor)
-        out_4 = fluid.layers.reshape(x, shape=actual_shape)
+            # Situation 4: have shape(Tensor), no actual shape(Tensor)
+            out_4 = self.reshape(x, shape=actual_shape)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3, res_4 = exe.run(
-            fluid.default_main_program(),
+            main_prog,
             feed={"x": input,
                   "shape": np.array([2, 5, 5]).astype("int32")},
             fetch_list=[out_1, out_2, out_3, out_4])
@@ -265,76 +274,108 @@ class TestReshapeAPI(unittest.TestCase):
         assert np.array_equal(res_3, input.reshape([5, 10]))
         assert np.array_equal(res_4, input.reshape(shape))
 
+    def test_paddle_api(self):
+        self._set_paddle_api()
+        self._test_api()
+
+    def test_fluid_api(self):
+        self._set_fluid_api()
+        self._test_api()
+
+    def test_imperative(self):
+        self._set_paddle_api()
+        input = np.random.random([2, 25]).astype("float32")
+        shape = [2, 5, 5]
+        with fluid.dygraph.guard():
+            x = self.to_tensor(input)
+            positive_five = self.fill_constant([1], "int32", 5)
+
+            out_1 = self.reshape(x, shape)
+
+            out_2 = self.reshape(x, shape=[positive_five, 10])
+
+            shape_tensor = self.to_tensor(np.array([2, 5, 5]).astype("int32"))
+            out_3 = self.reshape(x, shape=shape_tensor)
+
+        assert np.array_equal(out_1.numpy(), input.reshape(shape))
+        assert np.array_equal(out_2.numpy(), input.reshape([5, 10]))
+        assert np.array_equal(out_3.numpy(), input.reshape(shape))
+
 
 # Test Input Error
 class TestReshapeOpError(unittest.TestCase):
-    def test_errors(self):
+    def _set_paddle_api(self):
+        self.data = paddle.data
+        self.reshape = paddle.reshape
+
+    def _set_fluid_api(self):
+        self.data = fluid.data
+        self.reshape = fluid.layers.reshape
+
+    def _test_errors(self):
         with program_guard(Program(), Program()):
             # The x type of reshape_op must be Variable.
             def test_x_type():
                 x1 = fluid.create_lod_tensor(
                     np.array([[-1]]), [[1]], fluid.CPUPlace())
-                fluid.layers.reshape(x1, shape=[1])
+                self.reshape(x1, shape=[1])
 
             self.assertRaises(TypeError, test_x_type)
 
             # The x dtype of reshape_op must be float16, float32, float64, int32 or int64.
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name="x2",
-                    shape=[2, 25],
-                    append_batch_size=False,
-                    dtype="bool")
-                fluid.layers.reshape(x2, shape=[2, 5, 5])
+                x2 = self.data(name="x2", shape=[2, 25], dtype="bool")
+                self.reshape(x2, shape=[2, 5, 5])
 
             self.assertRaises(TypeError, test_x_dtype)
 
             def test_x_dtype_float16():
-                x_float16 = fluid.layers.data(
-                    name="x_float16",
-                    shape=[2, 25],
-                    append_batch_size=False,
-                    dtype="float16")
-                fluid.layers.reshape(x_float16, shape=[2, 5, 5])
+                x_float16 = self.data(
+                    name="x_float16", shape=[2, 25], dtype="float16")
+                self.reshape(x_float16, shape=[2, 5, 5])
 
             test_x_dtype_float16()
 
-            x3 = fluid.layers.data(
-                name="x3",
-                shape=[2, 25],
-                append_batch_size=False,
-                dtype="float32")
+            x3 = self.data(name="x3", shape=[2, 25], dtype="float32")
 
             # The argument shape's type of reshape_op must be list, tuple or Variable.
             def test_shape_type():
-                fluid.layers.reshape(x3, shape=1)
+                self.reshape(x3, shape=1)
 
             self.assertRaises(TypeError, test_shape_type)
 
             # The argument actual_shape's type of reshape_op must be Variable or None.
             def test_actual_shape_type():
-                fluid.layers.reshape(x3, shape=[25, 2], actual_shape=1)
+                self.reshape(x3, shape=[25, 2], actual_shape=1)
 
             self.assertRaises(TypeError, test_actual_shape_type)
 
             # The argument shape have more than one -1.
             def test_shape_1():
-                fluid.layers.reshape(x3, shape=[-1, -1, 5])
+                self.reshape(x3, shape=[-1, -1, 5])
 
             self.assertRaises(AssertionError, test_shape_1)
 
             # The argument shape have element 0 whose index exceed the input dimension.
             def test_shape_2():
-                fluid.layers.reshape(x3, [2, 5, 5, 0])
+                self.reshape(x3, [2, 5, 5, 0])
 
             self.assertRaises(AssertionError, test_shape_2)
 
             # The argument shape have more than one negative value.
             def test_shape_3():
-                fluid.layers.reshape(x3, [-1, -2, 5])
+                self.reshape(x3, [-1, -2, 5])
 
             self.assertRaises(AssertionError, test_shape_3)
 
+    def test_paddle_api_error(self):
+        self._set_paddle_api()
+        self._test_errors()
+
+    def test_fluid_api_error(self):
+        self._set_fluid_api()
+        self._test_errors()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py
index bc50cf197f63e6082ea1d3fdbff1891f500e5b9a..9abbee173852baf9db998aad3b71edabdb3e11ed 100644
--- a/python/paddle/fluid/tests/unittests/test_retain_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py
@@ -17,16 +17,16 @@ import paddle
 import paddle.fluid as fluid
 import unittest
 
-paddle.enable_imperative()
+paddle.disable_static()
 SEED = 2020
 np.random.seed(SEED)
-fluid.default_main_program().random_seed = SEED
+paddle.manual_seed(SEED)
 
 
 class Generator(fluid.dygraph.Layer):
     def __init__(self):
         super(Generator, self).__init__()
-        self.conv1 = paddle.nn.Conv2D(3, 3, 3, 1)
+        self.conv1 = paddle.nn.Conv2d(3, 3, 3, padding=1)
 
     def forward(self, x):
         x = self.conv1(x)
@@ -37,7 +37,7 @@ class Generator(fluid.dygraph.Layer):
 class Discriminator(fluid.dygraph.Layer):
     def __init__(self):
         super(Discriminator, self).__init__()
-        self.convd = paddle.nn.Conv2D(6, 3, 1)
+        self.convd = paddle.nn.Conv2d(6, 3, 1)
 
     def forward(self, x):
         x = self.convd(x)
@@ -60,8 +60,10 @@ class TestRetainGraph(unittest.TestCase):
                 interpolatesv = fake_data
             elif type == 'mixed':
                 alpha = paddle.rand((real_data.shape[0], 1))
-                alpha = paddle.expand(
-                    alpha, [1, np.prod(real_data.shape) // real_data.shape[0]])
+                alpha = paddle.expand(alpha, [
+                    real_data.shape[0],
+                    np.prod(real_data.shape) // real_data.shape[0]
+                ])
                 alpha = paddle.reshape(alpha, real_data.shape)
                 interpolatesv = alpha * real_data + ((1 - alpha) * fake_data)
             else:
@@ -73,7 +75,7 @@ class TestRetainGraph(unittest.TestCase):
 
             outs = paddle.fill_constant(disc_interpolates.shape,
                                         disc_interpolates.dtype, 1.0)
-            gradients = paddle.imperative.grad(
+            gradients = paddle.grad(
                 outputs=disc_interpolates,
                 inputs=fake_AB,
                 grad_outputs=outs,
@@ -90,12 +92,12 @@ class TestRetainGraph(unittest.TestCase):
         else:
             return 0.0, None
 
-    def test_retain(self):
+    def run_retain(self, need_retain):
         g = Generator()
         d = Discriminator()
 
-        optim_g = paddle.optimizer.Adam(parameter_list=g.parameters())
-        optim_d = paddle.optimizer.Adam(parameter_list=d.parameters())
+        optim_g = paddle.optimizer.Adam(parameters=g.parameters())
+        optim_d = paddle.optimizer.Adam(parameters=d.parameters())
 
         gan_criterion = paddle.nn.MSELoss()
         l1_criterion = paddle.nn.L1Loss()
@@ -103,8 +105,8 @@ class TestRetainGraph(unittest.TestCase):
         A = np.random.rand(2, 3, 32, 32).astype('float32')
         B = np.random.rand(2, 3, 32, 32).astype('float32')
 
-        realA = paddle.imperative.to_variable(A)
-        realB = paddle.imperative.to_variable(B)
+        realA = paddle.to_variable(A)
+        realB = paddle.to_variable(B)
         fakeB = g(realA)
 
         optim_d.clear_gradients()
@@ -117,7 +119,7 @@ class TestRetainGraph(unittest.TestCase):
             d, realA, fakeB, lambda_gp=10.0)
         loss_d = gan_criterion(G_pred_fake, false_target) + G_gradient_penalty
 
-        loss_d.backward(retain_graph=True)
+        loss_d.backward(retain_graph=need_retain)
         optim_d.minimize(loss_d)
 
         optim_g.clear_gradients()
@@ -130,6 +132,11 @@ class TestRetainGraph(unittest.TestCase):
         loss_g.backward()
         optim_g.minimize(loss_g)
 
+    def test_retain(self):
+        self.run_retain(need_retain=True)
+        self.assertRaises(
+            fluid.core.EnforceNotMet, self.run_retain, need_retain=False)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index eb12bc741767340a3e7e3580a8b95065d4267693..0f225758ced3bf7d6fd821be09f2dbf11ff1cc6d 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
+import paddle
 
 
 def create_selected_rows_and_tensor(scope, place, height, row_num,
@@ -222,5 +223,59 @@ class TestRmspropOp(TestBase):
                         size=size)
 
 
+class TestRMSPropV2(unittest.TestCase):
+    def test_rmsprop_dygraph(self):
+        paddle.disable_static()
+        value = np.arange(26).reshape(2, 13).astype("float32")
+        a = paddle.to_tensor(value)
+        linear = paddle.nn.Linear(13, 5, dtype="float32")
+        # This can be any optimizer supported by dygraph.
+        adam = paddle.optimizer.RMSProp(
+            learning_rate=0.01,
+            parameters=linear.parameters(),
+            weight_decay=0.01)
+        out = linear(a)
+        out.backward()
+        adam.step()
+        adam.clear_gradients()
+
+    def test_rmsprop(self):
+        place = fluid.CPUPlace()
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+
+            rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
+            rms_optimizer.minimize(avg_cost)
+
+            fetch_list = [avg_cost]
+            train_reader = paddle.batch(
+                paddle.dataset.uci_housing.train(), batch_size=1)
+            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for data in train_reader():
+                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+
+    def test_raise_error(self):
+        self.assertRaises(ValueError, paddle.optimizer.RMSProp, None)
+        self.assertRaises(
+            ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None)
+        self.assertRaises(
+            ValueError,
+            paddle.optimizer.RMSProp,
+            learning_rate=0.1,
+            epsilon=None)
+        self.assertRaises(
+            ValueError,
+            paddle.optimizer.RMSProp,
+            learning_rate=0.1,
+            momentum=None)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
index 6ca194b2694b6c7537ceb94e11eb1a1a0aeb8d8d..7e2ef36c1a7fda5c31049ec9c752c5226bfb89dc 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
@@ -248,7 +248,8 @@ class PolicyGradient(object):
             func=reward_func, x=[action, length], out=reward)
         neg_log_prob = layers.cross_entropy(act_prob, action)
         cost = neg_log_prob * reward
-        cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
+        cost = (layers.reduce_sum(cost) /
+                layers.cast(layers.reduce_sum(length), "float32")
                 ) if length is not None else layers.reduce_mean(cost)
         optimizer = fluid.optimizer.Adam(self.lr)
         optimizer.minimize(cost)
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index 5e9c67c1a7a29b69a977cc94487fc3d26f24eeb8..ce3b060828ac475a10d61bf756423069ab0a70c1 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -16,6 +16,8 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle
+import paddle.fluid as fluid
 from op_test import OpTest
 import paddle.fluid.core as core
 
@@ -173,5 +175,55 @@ class TestScatterOp5(OpTest):
             self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True)
 
 
+class TestScatterAPI(unittest.TestCase):
+    def setUp(self):
+        self.places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            self.places.append(fluid.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input = fluid.data(name="input", shape=[3, 2], dtype="float64")
+            index = fluid.data(name="index", shape=[4], dtype="int64")
+            updates = fluid.data(name="updates", shape=[4, 2], dtype="float64")
+            result = paddle.scatter(input, index, updates, False)
+
+            input_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64)
+            index_data = np.array([2, 1, 0, 1]).astype(np.int64)
+            updates_data = np.array(
+                [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "input": input_data,
+                                  "index": index_data,
+                                  "updates": updates_data
+                              },
+                              fetch_list=[result])
+            self.assertEqual((fetches[0] == \
+                              np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True)
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64)
+                index_data = np.array([2, 1, 0, 1]).astype(np.int64)
+                updates_data = np.array(
+                    [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64)
+
+                x = fluid.dygraph.to_variable(x_data)
+                index = fluid.dygraph.to_variable(index_data)
+                updates = fluid.dygraph.to_variable(updates_data)
+
+                output1 = paddle.scatter(x, index, updates, overwrite=False)
+                self.assertEqual((output1.numpy() == \
+                                  np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py
index 6070c84ff236274cc1778d0dce9ab40d884ce7ec..590ef11e9cb5de7414ff8745b719e3ffb4e044d8 100644
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
@@ -17,9 +17,26 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import six
+import paddle.fluid.core as core
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.fluid import compiler, Program, program_guard
+
+
+def ref_selu(x,
+             scale=1.0507009873554804934193349852946,
+             alpha=1.6732632423543772848170429916717):
+    out = np.copy(x)
+    out_flat = out.flatten()
+    for i in range(out_flat.size):
+        if out_flat[i] < 0:
+            out_flat[i] = alpha * np.exp(out_flat[i]) - alpha
+        out_flat[i] = scale * out_flat[i]
+    out = out_flat.reshape(x.shape)
+    return out
 
 
 class SeluTest(OpTest):
@@ -39,17 +56,10 @@ class SeluTest(OpTest):
         # zero.
         x[np.abs(x) < 0.005] = 0.02
 
-        x_flat = x.flatten()
-
-        for i in range(x_flat.size):
-            if x_flat[i] < 0:
-                x_flat[i] = alpha * np.exp(x_flat[i]) - alpha
-            x_flat[i] = scale * x_flat[i]
-
-        out_np = x_flat.reshape(self.x_shape)
+        out = ref_selu(x, scale, alpha)
 
         self.inputs = {'X': x}
-        self.outputs = {'Out': out_np}
+        self.outputs = {'Out': out}
 
         self.attrs = {
             'alpha': alpha,
@@ -69,17 +79,60 @@ class SeluTest(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestSeluOpError(unittest.TestCase):
+class TestSeluAPI(unittest.TestCase):
+    # test paddle.nn.SELU, paddle.nn.functional.selu
+    def setUp(self):
+        self.scale = 1.5
+        self.alpha = 2.0
+        self.x_np = np.random.normal(size=[3, 5, 5, 10]).astype(np.float64)
+        # Since zero point in selu is not differentiable, avoid randomize
+        # zero.
+        self.x_np[np.abs(self.x_np) < 0.005] = 0.02
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.selu(x, self.scale, self.alpha)
+            selu = paddle.nn.SELU(self.scale, self.alpha)
+            out2 = selu(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_selu(self.x_np, self.scale, self.alpha)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.selu(x, self.scale, self.alpha)
+        selu = paddle.nn.SELU(self.scale, self.alpha)
+        out2 = selu(x)
+        out_ref = ref_selu(self.x_np, self.scale, self.alpha)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.selu(x, self.scale, self.alpha)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_selu(self.x_np, self.scale, self.alpha)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.selu, 1)
+            self.assertRaises(TypeError, F.selu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.selu, x_int32)
-            # support the input dtype is float32
-            x_fp32 = fluid.data(name='x_fp32', shape=[12, 10], dtype='float32')
-            fluid.layers.selu(x_fp32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.selu, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.selu(x_fp16)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
index b84e3b5377f2796803707dfd68cd5450c512fce7..da5080eabddc93f0c3d08f16e0a7c20b52af47e0 100644
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 
@@ -54,5 +55,32 @@ class TestSignOpError(unittest.TestCase):
             fluid.layers.sign(input4)
 
 
+class TestSignAPI(unittest.TestCase):
+    def test_dygraph(self):
+        with fluid.dygraph.guard():
+            np_x = np.array([-1., 0., -0., 1.2, 1.5], dtype='float64')
+            x = paddle.to_tensor(np_x)
+            z = paddle.sign(x)
+            np_z = z.numpy()
+            z_expected = np.sign(np_x)
+            self.assertEqual((np_z == z_expected).all(), True)
+
+    def test_static(self):
+        with program_guard(Program(), Program()):
+            # The input type of sign_op must be Variable or numpy.ndarray.
+            input1 = 12
+            self.assertRaises(TypeError, paddle.tensor.math.sign, input1)
+            # The input dtype of sign_op must be float16, float32, float64.
+            input2 = fluid.layers.data(
+                name='input2', shape=[12, 10], dtype="int32")
+            input3 = fluid.layers.data(
+                name='input3', shape=[12, 10], dtype="int64")
+            self.assertRaises(TypeError, paddle.tensor.math.sign, input2)
+            self.assertRaises(TypeError, paddle.tensor.math.sign, input3)
+            input4 = fluid.layers.data(
+                name='input4', shape=[4], dtype="float16")
+            paddle.sign(input4)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 5ccc667799320d1ef7704b1e4416d9685ec6ecd2..fdcd2d350a6fac115086b5677a972f8b1145ff95 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -663,5 +663,21 @@ class TestImperativeVarBaseGetItem(unittest.TestCase):
         self.assertRaises(Exception, test_float_in_index)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestImperativeCUDAPinnedInput(unittest.TestCase):
+    def test_input_cuda_pinned_var(self):
+        with fluid.dygraph.guard():
+            data = np.random.random((2, 80, 16128)).astype('float32')
+            var = core.VarBase(
+                value=data,
+                name='',
+                persistable=False,
+                place=fluid.CUDAPinnedPlace(),
+                zero_copy=False)
+            sliced = var[:, 10:, :var.shape[1]]
+            self.assertEqual(sliced.shape, [2, 70, 80])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a97f57aaae5f290b20e34242b1b43e5e352223d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+
+
+def smooth_l1_loss_forward(val, delta):
+    abs_val = abs(val)
+    if abs_val <= delta:
+        return 0.5 * val * val
+    else:
+        return delta * (abs_val - 0.5 * delta)
+
+
+def smooth_l1_loss_np(input, label, reduction='mean', delta=1.0):
+    diff = input - label
+    out = np.vectorize(smooth_l1_loss_forward)(diff, delta)
+    if reduction == 'sum':
+        return np.sum(out)
+    elif reduction == 'mean':
+        return np.mean(out)
+    elif reduction == 'none':
+        return out
+
+
+class SmoothL1Loss(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+
+    def test_smooth_l1_loss_mean(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss()
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss()
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, reduction='mean')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    def test_smooth_l1_loss_sum(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum')
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum')
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, reduction='sum')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    def test_smooth_l1_loss_none(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none')
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none')
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, reduction='none')
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+    def test_smooth_l1_loss_delta(self):
+        input_np = np.random.random([100, 200]).astype(np.float32)
+        label_np = np.random.random([100, 200]).astype(np.float32)
+        delta = np.random.rand()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[100, 200], dtype='float32')
+            label = fluid.data(name='label', shape=[100, 200], dtype='float32')
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta)
+            ret = smooth_l1_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+        with fluid.dygraph.guard():
+            smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta)
+            dy_ret = smooth_l1_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = smooth_l1_loss_np(input_np, label_np, delta=delta)
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 1df50d63e3f67424ed1f42b94c317030ed69c6e9..04d5cc941a4636da0352fe9221cdad8bdfcd2bd9 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -20,6 +20,10 @@ from op_test import OpTest
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+import paddle
+import paddle.nn.functional as F
+
+np.random.seed(10)
 
 
 def stable_softmax(x):
@@ -31,6 +35,15 @@ def stable_softmax(x):
     return exps / np.sum(exps)
 
 
+def ref_softmax(x, axis=None, dtype=None):
+    x_t = x.copy()
+    if dtype is not None:
+        x_t = x_t.astype(dtype)
+    if axis is None:
+        axis = -1
+    return np.apply_along_axis(stable_softmax, axis, x_t)
+
+
 class TestSoftmaxOp(OpTest):
     def get_x_shape(self):
         return [10, 10]
@@ -89,20 +102,6 @@ class TestSoftmaxOp(OpTest):
                 check_dygraph=(self.use_mkldnn == False))
 
 
-class TestSoftmaxOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # The input type of softmax_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            self.assertRaises(TypeError, fluid.layers.softmax, x1)
-            # The input dtype of softmax_op must be float16, float32 or float64.
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
-            self.assertRaises(TypeError, fluid.layers.softmax, x2)
-            x3 = fluid.layers.data(name='x3', shape=[4], dtype="float16")
-            fluid.layers.softmax(x3)
-
-
 class TestSoftmaxOp2(TestSoftmaxOp):
     def get_x_shape(self):
         return [2, 3, 4, 5]
@@ -220,5 +219,60 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
         return [2, 3, 4, 5]
 
 
+class TestSoftmaxAPI(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32')
+        self.out_ref = np.apply_along_axis(stable_softmax, -1, self.x_np)
+
+    def test_static_check(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, 'float32')
+            out1 = F.softmax(x)
+            m = paddle.nn.Softmax()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_softmax(self.x_np, axis=-1, dtype=None)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_check(self):
+        paddle.disable_static(self.place)
+
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.softmax(x)
+        m = paddle.nn.Softmax()
+        out2 = m(x)
+        out_ref = ref_softmax(self.x_np, axis=-1, dtype=None)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out1 = F.softmax(x, axis=0)
+        m = paddle.nn.Softmax(axis=0)
+        out2 = m(x)
+        out_ref = ref_softmax(self.x_np, axis=0, dtype=None)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+
+        out = F.softmax(x, dtype=np.float64)
+        out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float64)
+        self.assertEqual(np.allclose(out_ref, out.numpy()), True)
+
+        paddle.enable_static()
+
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, F.softmax, 1)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32')
+            self.assertRaises(TypeError, F.softmax, x_int32)
+            # support the input dtype is float16
+            x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16')
+            F.softmax(x_fp16)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py
index 087586aa89607a58493c2d4427cbb6d30b31f0da..015b72fd1c5275f758a109451110f61b97c4a0c7 100644
--- a/python/paddle/fluid/tests/unittests/test_sort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sort_op.py
@@ -17,7 +17,6 @@ from __future__ import print_function
 import unittest
 import paddle
 import paddle.fluid as fluid
-import paddle.imperative as imperative
 import paddle.fluid.layers as layers
 import numpy as np
 import six
@@ -72,16 +71,17 @@ class TestSortDygraph(unittest.TestCase):
             self.place = core.CPUPlace()
 
     def test_api_0(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.sort(var_x)
-            self.assertEqual((np.sort(self.input_data) == out.numpy()).all(),
-                             True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.sort(var_x)
+        self.assertEqual((np.sort(self.input_data) == out.numpy()).all(), True)
+        paddle.enable_static()
 
     def test_api_1(self):
-        with imperative.guard(self.place):
-            var_x = imperative.to_variable(self.input_data)
-            out = paddle.sort(var_x, axis=-1)
-            self.assertEqual(
-                (np.sort(
-                    self.input_data, axis=-1) == out.numpy()).all(), True)
+        paddle.disable_static(self.place)
+        var_x = paddle.to_variable(self.input_data)
+        out = paddle.sort(var_x, axis=-1)
+        self.assertEqual(
+            (np.sort(
+                self.input_data, axis=-1) == out.numpy()).all(), True)
+        paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_std_layer.py b/python/paddle/fluid/tests/unittests/test_std_layer.py
index d1e0056304204bf0dbe47982bbf4b9574acf8eac..e455151481443c1fb918efd9e44444536adc6b7f 100644
--- a/python/paddle/fluid/tests/unittests/test_std_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_std_layer.py
@@ -15,65 +15,104 @@
 import unittest
 import numpy as np
 import paddle
-import paddle.fluid as fluid
 
 
-class TestStdLayer(unittest.TestCase):
+def ref_std(x, axis=None, unbiased=True, keepdim=False):
+    ddof = 1 if unbiased else 0
+    if isinstance(axis, int):
+        axis = (axis, )
+    if axis is not None:
+        axis = tuple(axis)
+    return np.std(x, axis=axis, ddof=ddof, keepdims=keepdim)
+
+
+class TestStdAPI(unittest.TestCase):
     def setUp(self):
-        self._dtype = "float64"
-        self._input = np.random.random([2, 3, 4, 5]).astype(self._dtype)
-
-    def static(self, axis=None, keepdim=False, unbiased=True):
-        prog = fluid.Program()
-        with fluid.program_guard(prog):
-            data = fluid.data(
-                name="data", dtype=self._dtype, shape=[None, 3, 4, 5])
-            out = prog.current_block().create_var(
-                dtype=self._dtype, shape=[2, 3, 4, 5])
-            paddle.std(input=data,
-                       axis=axis,
-                       keepdim=keepdim,
-                       unbiased=unbiased,
-                       out=out)
-
-        exe = fluid.Executor(self._place)
-        return exe.run(feed={"data": self._input},
-                       program=prog,
-                       fetch_list=[out])[0]
-
-    def dynamic(self, axis=None, keepdim=False, unbiased=True):
-        with fluid.dygraph.guard(self._place):
-            data = fluid.dygraph.to_variable(self._input)
-            out = paddle.std(input=data,
-                             axis=axis,
-                             keepdim=keepdim,
-                             unbiased=unbiased)
-            return out.numpy()
-
-    def numpy(self, axis=None, keepdim=False, unbiased=True):
-        ddof = 1 if unbiased else 0
-        axis = tuple(axis) if isinstance(axis, list) else axis
-        return np.std(self._input, axis=axis, keepdims=keepdim, ddof=ddof)
-
-    def test_equal(self):
-        places = []
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self._place = place
-            self.assertTrue(np.allclose(self.numpy(), self.static()))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(axis=[0, 2]), self.dynamic(axis=[0, 2])))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(
-                        axis=[1, 3], keepdim=True),
-                    self.dynamic(
-                        axis=[1, 3], keepdim=True)))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(unbiased=False), self.dynamic(unbiased=False)))
+        self.dtype = 'float64'
+        self.shape = [1, 3, 4, 10]
+        self.axis = [1, 3]
+        self.keepdim = False
+        self.unbiased = True
+        self.set_attrs()
+        self.x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.place=paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def set_attrs(self):
+        pass
+
+    def static(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.shape, self.dtype)
+            out = paddle.std(x, self.axis, self.unbiased, self.keepdim)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x}, fetch_list=[out])
+        return res[0]
+
+    def dygraph(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x)
+        out = paddle.std(x, self.axis, self.unbiased, self.keepdim)
+        paddle.enable_static()
+        return out.numpy()
+
+    def test_api(self):
+        out_ref = ref_std(self.x, self.axis, self.unbiased, self.keepdim)
+        out_dygraph = self.dygraph()
+        out_static = self.static()
+        for out in [out_dygraph, out_static]:
+            self.assertTrue(np.allclose(out_ref, out))
+            self.assertTrue(np.equal(out_ref.shape, out.shape).all())
+
+
+class TestStdAPI_dtype(TestStdAPI):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestStdAPI_axis_int(TestStdAPI):
+    def set_attrs(self):
+        self.axis = 2
+
+
+class TestStdAPI_axis_list(TestStdAPI):
+    def set_attrs(self):
+        self.axis = [1, 2]
+
+
+class TestStdAPI_axis_tuple(TestStdAPI):
+    def set_attrs(self):
+        self.axis = (1, 3)
+
+
+class TestStdAPI_keepdim(TestStdAPI):
+    def set_attrs(self):
+        self.keepdim = False
+
+
+class TestStdAPI_unbiased(TestStdAPI):
+    def set_attrs(self):
+        self.unbiased = False
+
+
+class TestStdAPI_alias(unittest.TestCase):
+    def test_alias(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(np.array([10, 12], 'float32'))
+        out1 = paddle.std(x).numpy()
+        out2 = paddle.tensor.std(x).numpy()
+        out3 = paddle.tensor.stat.std(x).numpy()
+        self.assertTrue(np.allclose(out1, out2))
+        self.assertTrue(np.allclose(out1, out3))
+        paddle.enable_static()
+
+
+class TestStdError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [2, 3, 4], 'int32')
+            self.assertRaises(TypeError, paddle.std, x)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index 8fd118c0193035fce294aa6ac23951d57ba43f78..b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -22,9 +22,11 @@ import unittest
 import numpy as np
 import os
 import six
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
+from paddle.fluid import Program, program_guard
 
 from op_test import OpTest, _set_use_system_allocator
 
@@ -202,5 +204,22 @@ class TestFP16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining):
         self.atol = 1e-2
 
 
+class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
+    def test_errors(self):
+        if not core.is_compiled_with_cuda():
+            return
+
+        with program_guard(Program(), Program()):
+            my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
+            x1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0))
+            self.assertRaises(TypeError, my_sync_batch_norm, x1)
+
+            # the input dtype of SyncBatchNorm must be float16 or float32 or float64
+            # float16 only can be set on GPU place
+            x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32")
+            self.assertRaises(TypeError, my_sync_batch_norm, x2)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aaf31993448ab0ff0c69f648cfa84c62d3e198b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -0,0 +1,251 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+
+
+# Situation 1: repeat_times is a list (without tensor)
+class TestTileOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.init_data()
+
+        self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")}
+        self.attrs = {'repeat_times': self.repeat_times}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+# with dimension expanding
+class TestTileOpRank2Expanding(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = [120]
+        self.repeat_times = [2, 2]
+
+
+class TestTileOpRank2(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+
+
+class TestTileOpRank3_Corner(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.repeat_times = (1, 1, 1)
+
+
+class TestTileOpRank3_Corner2(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.repeat_times = (2, 2)
+
+
+class TestTileOpRank3(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 15)
+        self.repeat_times = (2, 1, 4)
+
+
+class TestTileOpRank4(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.repeat_times = (3, 2, 1, 2)
+
+
+# Situation 2: repeat_times is a list (with tensor)
+class TestTileOpRank1_tensor_attr(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.init_data()
+        repeat_times_tensor = []
+        for index, ele in enumerate(self.repeat_times):
+            repeat_times_tensor.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'repeat_times_tensor': repeat_times_tensor,
+        }
+        self.attrs = {"repeat_times": self.infer_repeat_times}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+        self.infer_repeat_times = [-1]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [1, 1]
+        self.infer_repeat_times = [1, -1]
+
+
+class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+        self.infer_repeat_times = [-1, 3]
+
+
+# Situation 3: repeat_times is a tensor
+class TestTileOpRank1_tensor(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.init_data()
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float64"),
+            'RepeatTimes': np.array(self.repeat_times).astype("int32"),
+        }
+        self.attrs = {}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+
+
+# Situation 4: input x is Integer
+class TestTileOpInteger(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(4, 4, 5)).astype("int32")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 5: input x is Bool
+class TestTileOpBoolean(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Situation 56: input x is Integer
+class TestTileOpInt64_t(OpTest):
+    def setUp(self):
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int64")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTileError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            repeat_times = [2, 2]
+            self.assertRaises(TypeError, paddle.tile, x1, repeat_times)
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
+            self.assertRaises(TypeError, paddle.tile, x2, repeat_times)
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool")
+            x3.stop_gradient = False
+            self.assertRaises(ValueError, paddle.tile, x3, repeat_times)
+
+
+class TestTileAPIStatic(unittest.TestCase):
+    def test_api(self):
+        with program_guard(Program(), Program()):
+            repeat_times = [2, 2]
+            x1 = fluid.layers.data(name='x1', shape=[4], dtype="int32")
+            out = paddle.tile(x1, repeat_times)
+            positive_2 = fluid.layers.fill_constant([1], dtype="int32", value=2)
+            out2 = paddle.tile(x1, repeat_times=[positive_2, 2])
+
+
+# Test python API
+class TestTileAPI(unittest.TestCase):
+    def test_api(self):
+        with fluid.dygraph.guard():
+            np_x = np.random.random([12, 14]).astype("float32")
+            x = paddle.to_variable(np_x)
+
+            positive_2 = np.array([2]).astype("int32")
+            positive_2 = paddle.to_variable(positive_2)
+
+            repeat_times = np.array([2, 3]).astype("int32")
+            repeat_times = paddle.to_variable(repeat_times)
+
+            out_1 = paddle.tile(x, repeat_times=[2, 3])
+            out_2 = paddle.tile(x, repeat_times=[positive_2, 3])
+            out_3 = paddle.tile(x, repeat_times=repeat_times)
+
+            assert np.array_equal(out_1.numpy(), np.tile(np_x, (2, 3)))
+            assert np.array_equal(out_2.numpy(), np.tile(np_x, (2, 3)))
+            assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..54e7765c0fb76844a6123fceea6c1ef79dc0c2bf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
@@ -0,0 +1,244 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid.core as core
+
+
+def numpy_topk(x, k=1, axis=-1, largest=True):
+    if axis < 0:
+        axis = len(x.shape) + axis
+    if largest:
+        indices = np.argsort(-x, axis=axis)
+    else:
+        indices = np.argsort(x, axis=axis)
+    if largest:
+        value = -np.sort(-x, axis=axis)
+    else:
+        value = np.sort(x, axis=axis)
+    indices = indices.take(indices=range(0, k), axis=axis)
+    value = value.take(indices=range(0, k), axis=axis)
+    return value, indices
+
+
+class TestTopkOp(OpTest):
+    def init_args(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def setUp(self):
+        self.op_type = "top_k_v2"
+        self.dtype = np.float64
+        self.input_data = np.random.rand(10, 20)
+        self.init_args()
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def test_check_output(self):
+        paddle.enable_static()
+        self.check_output()
+
+    def test_check_grad(self):
+        paddle.enable_static()
+        self.check_grad(set(['X']), 'Out')
+
+
+class TestTopOp1(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 0
+        self.largest = True
+
+
+class TestTopOp2(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopOp3(TestTopkOp):
+    def init_args(self):
+        self.k = 4
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopOp4(TestTopkOp):
+    def init_args(self):
+        self.k = 4
+        self.axis = 0
+        self.largest = False
+
+
+class TestTopkOp5(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def setUp(self):
+        self.op_type = "top_k_v2"
+        self.dtype = np.float64
+        self.input_data = np.random.rand(10, 10, 5)
+        self.init_args()
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        self.outputs = {'Out': output, 'Indices': indices}
+
+
+class TestTopkOp6(TestTopkOp):
+    def init_args(self):
+        self.k = 3
+        self.axis = 1
+        self.largest = True
+
+    def setUp(self):
+        self.op_type = "top_k_v2"
+        self.dtype = np.float64
+        self.input_data = np.random.rand(10, 10, 5)
+        self.init_args()
+        self.inputs = {'X': self.input_data}
+        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
+        output, indices = numpy_topk(
+            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        self.outputs = {'Out': output, 'Indices': indices}
+
+
+class TestTopKAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(123)
+        self.input_data = np.random.rand(6, 7, 8)
+        self.large_input_data = np.random.rand(2, 1030)
+
+    def run_dygraph(self, place):
+        paddle.disable_static(place)
+        input_tensor = paddle.to_tensor(self.input_data)
+        large_input_tensor = paddle.to_tensor(self.large_input_data)
+        # test case for basic test case 1
+        paddle_result = paddle.topk(input_tensor, k=2)
+        numpy_result = numpy_topk(self.input_data, k=2)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 2 with axis
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 3 with tensor K
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 4 with tensor largest
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 5 with axis -1
+        k_tensor = paddle.to_tensor(np.array([2]))
+        paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 6 for the partial sort 
+        paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
+        numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
+        self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
+        self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
+        # test case for basic test case 7 for the unsorted 
+        paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
+        sort_paddle = numpy_topk(
+            np.array(paddle_result[0].numpy()), axis=1, k=2)
+        numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+        self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
+
+    def run_static(self, place):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            input_tensor = paddle.static.data(
+                name="x", shape=[6, 7, 8], dtype="float64")
+            large_input_tensor = paddle.static.data(
+                name="large_x", shape=[2, 1030], dtype="float64")
+            k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
+            result1 = paddle.topk(input_tensor, k=2)
+            result2 = paddle.topk(input_tensor, k=2, axis=-1)
+            result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
+            result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
+            result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
+            result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
+            result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
+            exe = paddle.static.Executor(place)
+            input_data = np.random.rand(10, 20).astype("float64")
+            large_input_data = np.random.rand(2, 100).astype("float64")
+            paddle_result = exe.run(
+                feed={
+                    "x": self.input_data,
+                    "large_x": self.large_input_data,
+                    "k": np.array([2]).astype("int32")
+                },
+                fetch_list=[
+                    result1[0], result1[1], result2[0], result2[1], result3[0],
+                    result3[1], result4[0], result4[1], result5[0], result5[1],
+                    result6[0], result6[1], result7[0], result7[1]
+                ])
+            numpy_result = numpy_topk(self.input_data, k=2)
+            self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
+            numpy_result = numpy_topk(self.input_data, k=2, axis=-1)
+            self.assertTrue(np.allclose(paddle_result[2], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[3], numpy_result[1]))
+            numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+            self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
+            numpy_result = numpy_topk(
+                self.input_data, k=2, axis=1, largest=False)
+            self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
+            numpy_result = numpy_topk(
+                self.input_data, k=2, axis=-1, largest=False)
+            self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
+            numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
+            self.assertTrue(np.allclose(paddle_result[10], numpy_result[0]))
+            self.assertTrue(np.allclose(paddle_result[11], numpy_result[1]))
+            sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2)
+            numpy_result = numpy_topk(self.input_data, k=2, axis=1)
+            self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
+
+    def test_cases(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.run_dygraph(place)
+            self.run_static(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8d1e77134036bf7b28d4afb8bacaa44092b1053
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -0,0 +1,477 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer
+
+import unittest
+
+
+def generate_basic_params(mode="attn", self_attention=True):
+    batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)]
+    d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)]
+    attn_dropout = 0.0
+    embed_dim = d_head * num_heads
+    if mode == "attn":
+        if self_attention:
+            kdim, vdim = embed_dim, embed_dim
+            key_length, value_length = query_length, query_length
+        else:
+            kdim, vdim = [np.random.randint(5, 20) for _ in range(2)]
+            key_length = np.random.randint(2, 10)
+            value_length = key_length
+        return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout
+
+    else:
+        dropout, act_dropout = 0.0, 0.0
+        dim_feedforward = np.random.randint(128, 1024)
+        sequence_length = np.random.randint(2, 10)
+        if mode == "encoder_layer":
+            return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length
+        elif mode == "decoder_layer":
+            target_length = np.random.randint(2, 10)
+            return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length
+
+
+def generate_query_key_value_cache(self_attention,
+                                   batch_size,
+                                   num_heads,
+                                   query_length,
+                                   embed_dim,
+                                   key_length=None,
+                                   value_length=None,
+                                   kdim=None,
+                                   vdim=None,
+                                   cache=None):
+    query = np.random.rand(batch_size, query_length,
+                           embed_dim).astype("float32")
+    attn_mask = np.zeros((batch_size, num_heads, query_length, key_length))
+    attn_mask[0][0][0][0] = -1e9
+
+    head_dim = embed_dim // num_heads
+    if self_attention:
+        key, value = query, query
+    else:
+        key = np.random.rand(batch_size, key_length, kdim).astype("float32")
+        value = np.random.rand(batch_size, value_length, vdim).astype("float32")
+    cache_dict = {}
+    if cache:
+        if not self_attention:
+            cache_dict["static_k"] = np.random.rand(
+                batch_size, num_heads, key_length, head_dim).astype("float32")
+            cache_dict["static_v"] = np.random.rand(
+                batch_size, num_heads, value_length, head_dim).astype("float32")
+        else:
+            cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length,
+                                             head_dim).astype("float32")
+            cache_dict["v"] = np.random.rand(
+                batch_size, num_heads, value_length, head_dim).astype("float32")
+    else:
+        cache_dict = None
+    return query, key, value, attn_mask, cache_dict
+
+
+def fc(x, weight):
+    return np.matmul(x, weight)
+
+
+def softmax(x):
+    np.seterr(invalid='ignore')
+    output = np.zeros(x.shape, dtype=np.float64)
+    for i in range(x.shape[0]):
+        for j in range(x.shape[1]):
+            for k in range(x.shape[2]):
+                x_curr = x[i, j, k, :]
+                e_x = np.exp(x_curr - np.amax(x_curr))
+                output[i, j, k, :] = e_x / np.sum(e_x)
+    return output
+
+
+def batch_matmul(x, y):
+    assert x.shape[0] == y.shape[0]
+    assert x.shape[1] == y.shape[1]
+    retval = np.zeros(
+        (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64)
+    for i in range(x.shape[0]):
+        for j in range(x.shape[1]):
+            retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :])
+    return retval
+
+
+def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn):
+    k = k.transpose([0, 1, 3, 2])
+    qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64))
+    if attn_mask is not None:
+        qkt += attn_mask
+    weight = softmax(qkt)
+    attn_heads = batch_matmul(weight, v)
+    attn_heads = attn_heads.transpose((0, 2, 1, 3))
+    attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1],
+                                     attn_heads.shape[2] * attn_heads.shape[3]))
+    return attn_heads
+
+
+def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn):
+    with fluid.dygraph.guard():
+        head_dim = embed_dim // num_heads
+        k_weight = multi_head_attn.k_proj.weight.numpy()
+        v_weight = multi_head_attn.v_proj.weight.numpy()
+        k = fc(key, k_weight)
+        v = fc(value, v_weight)
+        k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim))
+        k = k.transpose((0, 2, 1, 3))
+        v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim))
+        v = v.transpose((0, 2, 1, 3))
+        return k, v
+
+
+def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention,
+                multi_head_attn, cache_dict):
+    q_weight = multi_head_attn.q_proj.weight.numpy()
+    q = fc(query, q_weight)
+    q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads))
+    q = q.transpose((0, 2, 1, 3))
+
+    if not self_attention and cache_dict:
+        k, v = cache_dict["static_k"], cache_dict["static_v"]
+    else:
+        k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn)
+        if cache_dict is not None:
+            k = np.concatenate((cache_dict["k"], k), axis=2)
+            v = np.concatenate((cache_dict["v"], v), axis=2)
+    return (q, k, v, cache_dict)
+
+
+def add(x, y=None):
+    fluid.enable_dygraph()
+    with fluid.dygraph.guard():
+        x = x.numpy() if not isinstance(x, np.ndarray) else x
+        if y is not None:
+            x += y
+            return x
+        return x
+
+
+def relu(x):
+    compare = x > 0
+    return x * compare
+
+
+def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None):
+    fluid.enable_dygraph()
+    with fluid.dygraph.guard():
+        # scale:
+        weight = norm.weight.numpy()
+        # shift:
+        bias = norm.bias.numpy()
+
+        batch_size, src_len, d_model = x.shape
+        x = x.reshape((batch_size * src_len, d_model))
+        mu = np.mean(x, axis=1, keepdims=True)
+        sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model
+        x1_up = (x - mu)
+        x1_down_1 = sigma_squar + epsilon
+        x1_down = np.sqrt(x1_down_1)
+        x1_down = x1_down.reshape((x1_down.shape[0], 1))
+        x1 = x1_up / x1_down
+        x_scaled = weight * x1
+        x_scaled_bias = x_scaled + bias
+        x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model))
+    return x_scaled_bias
+
+
+def ffn(src, encoder_layer, ffn_fc1_act="relu"):
+    assert ffn_fc1_act == "relu", "only relu is supported"
+    fluid.enable_dygraph()
+    with fluid.dygraph.guard():
+        src = src.numpy() if not isinstance(src, np.ndarray) else src
+        w1 = encoder_layer.linear1.weight.numpy()
+        w2 = encoder_layer.linear2.weight.numpy()
+        # fc1
+        x1 = fc(src, w1)
+        x1 = relu(x1)
+        # fc2
+        x2 = fc(x1, w2)
+        return x2
+
+
+class TestTransformer(unittest.TestCase):
+    def test_multi_head_attention(self):
+        def multihead_attention_test_helper(self_attention, cache):
+            paddle.framework.manual_seed(2020)
+            # self_attention|cross_attention, cache|No cache
+            with fluid.dygraph.guard(fluid.CPUPlace()):
+
+                # generate params for multi_head_attention
+                batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params(
+                    "attn", self_attention)
+                query, key, value, attn_mask, cache_dict = generate_query_key_value_cache(
+                    self_attention, batch_size, num_heads, query_length,
+                    embed_dim, key_length, value_length, kdim, vdim, cache)
+                if cache and self_attention:
+                    attn_mask = np.concatenate((attn_mask, attn_mask), axis=3)
+                need_weight, param_attr, bias_attr = False, None, None
+                # call paddle's function
+                multi_head_attn = MultiHeadAttention(
+                    embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight,
+                    param_attr, bias_attr)
+                # construct cache object
+                cache_obj = None
+                if cache_dict:
+                    if 'k' and 'v' in cache_dict:
+                        cache_obj = multi_head_attn.Cache(
+                            paddle.to_variable(cache_dict['k']),
+                            paddle.to_variable(cache_dict['v']))
+                    elif 'static_k' and 'static_v' in cache_dict:
+                        cache_obj = multi_head_attn.StaticCache(
+                            paddle.to_variable(cache_dict['static_k']),
+                            paddle.to_variable(cache_dict['static_v']))
+                if attn_mask is not None:
+                    attn_output = multi_head_attn(
+                        paddle.to_variable(query),
+                        paddle.to_variable(key),
+                        paddle.to_variable(value),
+                        paddle.to_variable(attn_mask), cache_obj)
+                else:
+                    attn_output = multi_head_attn(
+                        paddle.to_variable(query),
+                        paddle.to_variable(key),
+                        paddle.to_variable(value), attn_mask, cache_obj)
+                attn_output = attn_output[0] if cache_dict else attn_output
+
+                # implementation by numpy
+                # compute q, k, v
+                q, k, v, _ = prepare_qkv(query, key, value, num_heads,
+                                         embed_dim, self_attention,
+                                         multi_head_attn, cache_dict)
+                # scale dot product attention
+                attn_heads = scaled_dot_product_attention(
+                    q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn)
+                out_proj_weight = multi_head_attn.out_proj.weight.numpy()
+                reference = fc(attn_heads, out_proj_weight)
+
+                np.testing.assert_allclose(
+                    attn_output.numpy(), reference, atol=1e-6)
+
+        multihead_attention_test_helper(True, True)
+        multihead_attention_test_helper(True, False)
+        multihead_attention_test_helper(False, True)
+        multihead_attention_test_helper(False, False)
+
+    def test_transformer_encoder_layer(self):
+
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            paddle.framework.manual_seed(2020)
+
+            ffn_fc1_act = "relu"
+            # 1.generate basic params
+            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
+                mode="encoder_layer")
+            # 2.generate input for encoder
+            src = np.random.rand(batch_size, sequence_length,
+                                 d_model).astype("float32")
+            residual = src
+            src_mask = np.zeros((batch_size, n_head, sequence_length,
+                                 sequence_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+
+            # paddle
+            encoder_layer = TransformerEncoderLayer(
+                d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
+                attn_dropout, act_dropout)
+
+            encoder_output = encoder_layer(
+                paddle.to_variable(src),
+                paddle.to_variable(src_mask))  # paddle.to_variable(src_mask))
+            # 4.numpy:
+            # paddle self attention
+            self_attn = MultiHeadAttention(
+                d_model, n_head, dropout=attn_dropout)
+            attn_output = self_attn(
+                paddle.to_variable(src),
+                paddle.to_variable(src),
+                paddle.to_variable(src), paddle.to_variable(src_mask)).numpy()
+
+            src = attn_output + residual
+            src_norm = layer_norm(src, d_model, encoder_layer.norm1)
+            residual = src_norm
+
+            ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act)
+            src = residual + ffn_output
+            src = layer_norm(src, d_model, encoder_layer.norm2)
+
+            np.testing.assert_allclose(
+                encoder_output.numpy(), src, rtol=1e-5, atol=1e-6)
+
+    def test_transformer_decoder_layer(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            paddle.framework.manual_seed(2020)
+            activation = "relu"
+            normalize_before = False
+            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params(
+                mode="decoder_layer")
+            tgt = np.random.rand(batch_size, target_length,
+                                 d_model).astype("float32")
+            memory = np.random.rand(batch_size, source_length,
+                                    d_model).astype("float32")
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            for cache in [True, False]:
+                self_attn = MultiHeadAttention(
+                    d_model, n_head, dropout=attn_dropout)
+                cross_attn = MultiHeadAttention(
+                    d_model, n_head, dropout=attn_dropout)
+
+                # paddle decoderlayer:
+                decoder_layer = TransformerDecoderLayer(
+                    d_model, n_head, dim_feedforward, dropout, activation,
+                    attn_dropout, act_dropout, normalize_before)
+                cache_objs = None
+                if cache:
+                    cache_objs = decoder_layer.gen_cache(
+                        paddle.to_variable(memory))
+
+                decoder_output = decoder_layer(
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(memory),
+                    paddle.to_variable(tgt_mask),
+                    paddle.to_variable(memory_mask), cache_objs)
+
+                decoder_output = decoder_output[0].numpy(
+                ) if cache else decoder_output.numpy()
+
+                # numpy:
+                residual = tgt
+                # self-attn
+                self_attn_cache = cache_objs[
+                    0] if cache_objs is not None else None
+                tgt = self_attn(
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(tgt),
+                    paddle.to_variable(tgt_mask), self_attn_cache)
+
+                tgt = tgt[0].numpy() if cache else tgt.numpy()
+
+                tgt = residual + tgt
+                # postprocess
+                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1)
+                residual = tgt_norm
+                # cross-attn
+                cross_attn_cache = cache_objs[
+                    1] if cache_objs is not None else None
+                tgt = cross_attn(
+                    paddle.to_variable(tgt_norm),
+                    paddle.to_variable(memory),
+                    paddle.to_variable(memory),
+                    paddle.to_variable(memory_mask), cross_attn_cache)
+                tgt = tgt[0].numpy() if cache else tgt.numpy()
+
+                # postprocess
+                tgt = tgt + residual
+                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2)
+                residual = tgt_norm
+                # FFN
+                ffn_output = ffn(tgt_norm, decoder_layer, activation)
+                # post process
+                tgt = residual + ffn_output
+                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3)
+
+                np.testing.assert_allclose(
+                    decoder_output, tgt_norm, rtol=1e-5, atol=1e-6)
+
+    def test_encoder(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
+            mode="encoder_layer")
+
+        src = np.random.rand(batch_size, sequence_length,
+                             d_model).astype("float32")
+
+        src_mask = np.zeros((batch_size, n_head, sequence_length,
+                             sequence_length)).astype("float32")
+        src_mask[0][0][0][0] = -np.inf
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            encoder_layer = TransformerEncoderLayer(d_model, n_head,
+                                                    dim_feedforward, dropout)
+            num_layers = 6
+            encoder = TransformerEncoder(encoder_layer, num_layers)
+            # src, src_mask
+            enc_output = encoder(
+                paddle.to_variable(src), paddle.to_variable(src_mask))
+
+    def test_decoder(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+        tgt = np.random.rand(batch_size, target_length,
+                             d_model).astype("float32")
+        memory = np.random.rand(batch_size, source_length,
+                                d_model).astype("float32")
+        tgt_mask = np.zeros((batch_size, n_head, target_length,
+                             target_length)).astype("float32")
+        tgt_mask[0][0][0][0] = -1e9
+        memory_mask = np.zeros((batch_size, n_head, target_length,
+                                source_length)).astype("float32")
+        memory_mask[0][0][0][0] = -1e9
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            decoder_layer = TransformerDecoderLayer(d_model, n_head,
+                                                    dim_feedforward, dropout)
+            num_layers = 6
+            decoder = TransformerDecoder(decoder_layer, num_layers)
+
+            output = decoder(
+                paddle.to_variable(tgt),
+                paddle.to_variable(memory),
+                paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask))
+
+    def test_transformer(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout)
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
index 0f14c9d1c3ba99b5c9b1500e4b7ddabb690e9290..aed265b21b5781d88da0380b04872061e893d736 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
@@ -63,7 +63,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
         "diagonal: TypeError":
         "diagonal in {} must be a python Int".format(op_type),
         "input: ValueError":
-        "input shape in {} must be at least 2-D".format(op_type),
+        "x shape in {} must be at least 2-D".format(op_type),
     }
 
     class FailureCase(unittest.TestCase):
@@ -71,7 +71,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
             data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
             with self.assertRaisesRegexp(
                     eval(expected.split(':')[-1]), errmsg[expected]):
-                getattr(tensor, op_type)(input=data, diagonal=diagonal)
+                getattr(tensor, op_type)(x=data, diagonal=diagonal)
 
     class SuccessCase(TrilTriuOpDefaultTest):
         def initTestCase(self):
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 9a64dd1deea93f473d73d485ec5a9d707aaa54f9..158462a1e6e1012b7473a2410f2c003d04ea2e40 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -14,9 +14,12 @@
 
 from __future__ import print_function
 
+import sys
+import subprocess
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
@@ -472,5 +475,61 @@ class TestUniformRandomBatchSizeLikeOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_dtype)
 
 
+class TestUniformAlias(unittest.TestCase):
+    def test_alias(self):
+        paddle.uniform([2, 3], min=-5.0, max=5.0)
+        paddle.tensor.uniform([2, 3], min=-5.0, max=5.0)
+        paddle.tensor.random.uniform([2, 3], min=-5.0, max=5.0)
+
+        def test_uniform_random():
+            paddle.tensor.random.uniform_random([2, 3], min=-5.0, max=5.0)
+
+        self.assertRaises(AttributeError, test_uniform_random)
+
+
+class TestUniformOpError(unittest.TestCase):
+    def test_errors(self):
+        main_prog = Program()
+        start_prog = Program()
+        with program_guard(main_prog, start_prog):
+
+            def test_Variable():
+                x1 = fluid.create_lod_tensor(
+                    np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace())
+                paddle.tensor.random.uniform(x1)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            def test_Variable2():
+                x1 = np.zeros((4, 784))
+                paddle.tensor.random.uniform(x1)
+
+            self.assertRaises(TypeError, test_Variable2)
+
+            def test_dtype():
+                x2 = fluid.layers.data(
+                    name='x2', shape=[4, 784], dtype='float32')
+                paddle.tensor.random.uniform(x2, 'int32')
+
+            self.assertRaises(TypeError, test_dtype)
+
+            def test_out_dtype():
+                out = paddle.tensor.random.uniform(
+                    shape=[3, 4], dtype='float64')
+                self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
+
+            test_out_dtype()
+
+
+class TestUniformDygraphMode(unittest.TestCase):
+    def test_check_output(self):
+        with fluid.dygraph.guard():
+            x = paddle.tensor.random.uniform(
+                [10], dtype="float32", min=0.0, max=1.0)
+            x_np = x.numpy()
+            for i in range(10):
+                self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py
index 65194524adfcd7f48efe0f2378c95dba7a8c4a5e..ae36f8a9861560b51a78617d1b66aa7743b87580 100644
--- a/python/paddle/fluid/tests/unittests/test_unique.py
+++ b/python/paddle/fluid/tests/unittests/test_unique.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
@@ -125,5 +126,164 @@ class TestRandomGPU(TestUniqueOp):
             self.check_output_with_place(place, atol=1e-5)
 
 
+class TestSortedUniqueOp(TestUniqueOp):
+    def init_config(self):
+        self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64')}
+        unique, indices, inverse, count = np.unique(
+            self.inputs['X'],
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=None)
+        self.attrs = {
+            'dtype': int(core.VarDesc.VarType.INT32),
+            "return_index": True,
+            "return_inverse": True,
+            "return_counts": True,
+            "axis": None,
+            "is_sorted": True
+        }
+        self.outputs = {
+            'Out': unique,
+            'Indices': indices,
+            "Index": inverse,
+            "Counts": count,
+        }
+
+
+class TestUniqueOpAxisNone(TestUniqueOp):
+    def init_config(self):
+        self.inputs = {'X': np.random.random((4, 7, 10)).astype('float64')}
+        unique, indices, inverse, counts = np.unique(
+            self.inputs['X'],
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=None)
+        self.attrs = {
+            'dtype': int(core.VarDesc.VarType.INT32),
+            "return_index": True,
+            "return_inverse": True,
+            "return_counts": True,
+            "axis": None,
+            "is_sorted": True
+        }
+        self.outputs = {
+            'Out': unique,
+            'Indices': indices,
+            "Index": inverse,
+            "Counts": counts,
+        }
+
+
+class TestUniqueOpAxis1(TestUniqueOp):
+    def init_config(self):
+        self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')}
+        unique, indices, inverse, counts = np.unique(
+            self.inputs['X'],
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=1)
+        self.attrs = {
+            'dtype': int(core.VarDesc.VarType.INT32),
+            "return_index": True,
+            "return_inverse": True,
+            "return_counts": True,
+            "axis": [1],
+            "is_sorted": True
+        }
+        self.outputs = {
+            'Out': unique,
+            'Indices': indices,
+            "Index": inverse,
+            "Counts": counts,
+        }
+
+
+class TestUniqueAPI(unittest.TestCase):
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+        x_data = x_data = np.random.randint(0, 10, (120))
+        x = paddle.to_tensor(x_data)
+        out = paddle.unique(x)
+        expected_out = np.unique(x_data)
+        self.assertTrue((out.numpy() == expected_out).all(), True)
+        paddle.enable_static()
+
+    def test_dygraph_api_attr(self):
+        paddle.disable_static()
+        x_data = np.random.random((3, 5, 5)).astype("float32")
+        x = paddle.to_tensor(x_data)
+        out, index, inverse, counts = paddle.unique(
+            x,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=0)
+        np_out, np_index, np_inverse, np_counts = np.unique(
+            x_data,
+            return_index=True,
+            return_inverse=True,
+            return_counts=True,
+            axis=0)
+        self.assertTrue((out.numpy() == np_out).all(), True)
+        self.assertTrue((index.numpy() == np_index).all(), True)
+        self.assertTrue((inverse.numpy() == np_inverse).all(), True)
+        self.assertTrue((counts.numpy() == np_counts).all(), True)
+        paddle.enable_static()
+
+    def test_static_graph(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.data(name='x', shape=[3, 2], dtype='float64')
+            unique, inverse, counts = paddle.unique(
+                x, return_inverse=True, return_counts=True, axis=0)
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            x_np = np.array([[1, 2], [3, 4], [1, 2]]).astype('float64')
+            result = exe.run(feed={"x": x_np},
+                             fetch_list=[unique, inverse, counts])
+        np_unique, np_inverse, np_counts = np.unique(
+            x_np, return_inverse=True, return_counts=True, axis=0)
+        self.assertTrue(np.allclose(result[0], np_unique))
+        self.assertTrue(np.allclose(result[1], np_inverse))
+        self.assertTrue(np.allclose(result[2], np_counts))
+
+
+class TestUniqueError(unittest.TestCase):
+    def test_input_dtype(self):
+        def test_x_dtype():
+            with paddle.static.program_guard(paddle.static.Program(),
+                                             paddle.static.Program()):
+                x = paddle.data(name='x', shape=[10, 10], dtype='float16')
+                result = paddle.unique(x)
+
+            self.assertRaises(TypeError, test_x_dtype)
+
+    def test_attr(self):
+        x = paddle.data(name='x', shape=[10, 10], dtype='float64')
+
+        def test_return_index():
+            result = paddle.unique(x, return_index=0)
+
+        self.assertRaises(TypeError, test_return_index)
+
+        def test_return_inverse():
+            result = paddle.unique(x, return_inverse='s')
+
+        self.assertRaises(TypeError, test_return_inverse)
+
+        def test_return_counts():
+            result = paddle.unique(x, return_counts=3)
+
+        self.assertRaises(TypeError, test_return_counts)
+
+        def test_axis():
+            result = paddle.unique(x, axis='12')
+
+        self.assertRaises(TypeError, test_axis)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 1b353e1379076cc71e8013487e0b22f5bf03dc09..9382d53e7fec6ba9e1217f99ba5006b3dfe5c150 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -81,7 +81,7 @@ class API_TestUnsqueeze(unittest.TestCase):
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 10], dtype='float64')
-            result_squeeze = paddle.unsqueeze(data1, axes=[1])
+            result_squeeze = paddle.unsqueeze(data1, axis=[1])
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([5, 1, 10]).astype('float64')
@@ -98,7 +98,7 @@ class TestUnsqueezeOpError(unittest.TestCase):
             def test_axes_type():
                 x6 = fluid.layers.data(
                     shape=[-1, 10], dtype='float16', name='x3')
-                paddle.unsqueeze(x6, axes=3.2)
+                paddle.unsqueeze(x6, axis=3.2)
 
             self.assertRaises(TypeError, test_axes_type)
 
@@ -108,7 +108,7 @@ class API_TestUnsqueeze2(unittest.TestCase):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
             data2 = fluid.data('data2', shape=[1], dtype='int32')
-            result_squeeze = paddle.unsqueeze(data1, axes=data2)
+            result_squeeze = paddle.unsqueeze(data1, axis=data2)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([5, 1, 10]).astype('float64')
@@ -125,7 +125,7 @@ class API_TestUnsqueeze3(unittest.TestCase):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
             data2 = fluid.data('data2', shape=[1], dtype='int32')
-            result_squeeze = paddle.unsqueeze(data1, axes=[data2, 3])
+            result_squeeze = paddle.unsqueeze(data1, axis=[data2, 3])
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([5, 1, 10, 1]).astype('float64')
@@ -143,7 +143,7 @@ class API_TestDyUnsqueeze(unittest.TestCase):
             input_1 = np.random.random([5, 1, 10]).astype("int32")
             input1 = np.squeeze(input_1, axis=1)
             input = fluid.dygraph.to_variable(input_1)
-            output = paddle.unsqueeze(input, axes=[1])
+            output = paddle.unsqueeze(input, axis=[1])
             out_np = output.numpy()
             self.assertTrue(np.allclose(input1, out_np))
 
@@ -154,7 +154,7 @@ class API_TestDyUnsqueeze2(unittest.TestCase):
             input_1 = np.random.random([5, 1, 10]).astype("int32")
             input1 = np.squeeze(input_1, axis=1)
             input = fluid.dygraph.to_variable(input_1)
-            output = paddle.unsqueeze(input, axes=1)
+            output = paddle.unsqueeze(input, axis=1)
             out_np = output.numpy()
             self.assertTrue(np.allclose(input1, out_np))
 
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 7e565ca31b219366b7ab83267b46f32e5812d983..80b94704c388824901312b5d577cb5cfd0d0c75b 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
@@ -28,6 +29,74 @@ class TestVarBase(unittest.TestCase):
         self.dtype = np.float32
         self.array = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
 
+    def test_to_tensor(self):
+        def _test_place(place):
+            with fluid.dygraph.guard():
+                x = paddle.to_tensor(
+                    1, dtype='float32', place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1.]))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+                self.assertEqual(x.shape, [1])
+                self.assertEqual(x.stop_gradient, False)
+                self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                x = paddle.to_tensor(
+                    (1, 2), dtype='float32', place=place, stop_gradient=False)
+                x = paddle.to_tensor(
+                    [1, 2], dtype='float32', place=place, stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), [1., 2.]))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+                self.assertEqual(x.grad, None)
+                self.assertEqual(x.shape, [2])
+                self.assertEqual(x.stop_gradient, False)
+                self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                x = paddle.to_tensor(
+                    self.array,
+                    dtype='float32',
+                    place=place,
+                    stop_gradient=False)
+                self.assertTrue(np.array_equal(x.numpy(), self.array))
+                self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
+                self.assertEqual(x.shape, self.shape)
+                self.assertEqual(x.stop_gradient, False)
+                self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                y = paddle.to_tensor(x)
+                y = paddle.to_tensor(y, dtype='float64', place=place)
+                self.assertTrue(np.array_equal(y.numpy(), self.array))
+                self.assertEqual(y.dtype, core.VarDesc.VarType.FP64)
+                self.assertEqual(y.shape, self.shape)
+                self.assertEqual(y.stop_gradient, True)
+                self.assertEqual(y.type, core.VarDesc.VarType.LOD_TENSOR)
+                z = x + y
+                self.assertTrue(np.array_equal(z.numpy(), 2 * self.array))
+
+                x = paddle.to_tensor(
+                    [1 + 2j, 1 - 2j], dtype='complex64', place=place)
+                y = paddle.to_tensor(x)
+                self.assertTrue(np.array_equal(x.numpy(), [1 + 2j, 1 - 2j]))
+                self.assertEqual(y.dtype, 'complex64')
+                self.assertEqual(y.shape, [2])
+                self.assertEqual(y.real.stop_gradient, True)
+                self.assertEqual(y.real.type, core.VarDesc.VarType.LOD_TENSOR)
+
+                with self.assertRaises(TypeError):
+                    paddle.to_tensor('test')
+                with self.assertRaises(TypeError):
+                    paddle.to_tensor(1, dtype='test')
+                with self.assertRaises(ValueError):
+                    paddle.to_tensor([[1], [2, 3]])
+                with self.assertRaises(ValueError):
+                    paddle.to_tensor([[1], [2, 3]], place='test')
+                with self.assertRaises(ValueError):
+                    paddle.to_tensor([[1], [2, 3]], place=1)
+
+        _test_place(core.CPUPlace())
+        if core.is_compiled_with_cuda():
+            _test_place(core.CUDAPinnedPlace())
+            _test_place(core.CUDAPlace(0))
+
     def test_to_variable(self):
         with fluid.dygraph.guard():
             var = fluid.dygraph.to_variable(self.array, name="abc")
@@ -76,7 +145,7 @@ class TestVarBase(unittest.TestCase):
         with fluid.dygraph.guard():
             var = fluid.dygraph.to_variable(self.array)
 
-            self.assertEqual(var.name, 'generated_var_0')
+            self.assertEqual(var.name, 'generated_tensor_0')
             var.name = 'test'
             self.assertEqual(var.name, 'test')
 
diff --git a/python/paddle/fluid/tests/unittests/test_variance_layer.py b/python/paddle/fluid/tests/unittests/test_variance_layer.py
index 569f064db8549b5f28bc751a36cbe4b379636379..b5bb3cc978a558bb52f5f56c58f107b653956a75 100644
--- a/python/paddle/fluid/tests/unittests/test_variance_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_variance_layer.py
@@ -15,65 +15,104 @@
 import unittest
 import numpy as np
 import paddle
-import paddle.fluid as fluid
 
 
-class TestVarianceLayer(unittest.TestCase):
+def ref_var(x, axis=None, unbiased=True, keepdim=False):
+    ddof = 1 if unbiased else 0
+    if isinstance(axis, int):
+        axis = (axis, )
+    if axis is not None:
+        axis = tuple(axis)
+    return np.var(x, axis=axis, ddof=ddof, keepdims=keepdim)
+
+
+class TestVarAPI(unittest.TestCase):
     def setUp(self):
-        self._dtype = "float64"
-        self._input = np.random.random([2, 3, 4, 5]).astype(self._dtype)
-
-    def static(self, axis=None, keepdim=False, unbiased=True):
-        prog = fluid.Program()
-        with fluid.program_guard(prog):
-            data = fluid.data(
-                name="data", dtype=self._dtype, shape=[None, 3, 4, 5])
-            out = prog.current_block().create_var(
-                dtype=self._dtype, shape=[2, 3, 4, 5])
-            paddle.var(input=data,
-                       axis=axis,
-                       keepdim=keepdim,
-                       unbiased=unbiased,
-                       out=out)
-
-        exe = fluid.Executor(self._place)
-        return exe.run(feed={"data": self._input},
-                       program=prog,
-                       fetch_list=[out])[0]
-
-    def dynamic(self, axis=None, keepdim=False, unbiased=True):
-        with fluid.dygraph.guard(self._place):
-            data = fluid.dygraph.to_variable(self._input)
-            out = paddle.var(input=data,
-                             axis=axis,
-                             keepdim=keepdim,
-                             unbiased=unbiased)
-            return out.numpy()
-
-    def numpy(self, axis=None, keepdim=False, unbiased=True):
-        ddof = 1 if unbiased else 0
-        axis = tuple(axis) if isinstance(axis, list) else axis
-        return np.var(self._input, axis=axis, keepdims=keepdim, ddof=ddof)
-
-    def test_equal(self):
-        places = [fluid.CPUPlace()]
-        if fluid.core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for place in places:
-            self._place = place
-            self.assertTrue(np.allclose(self.numpy(), self.static()))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(axis=[0, 2]), self.dynamic(axis=[0, 2])))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(
-                        axis=[1, 3], keepdim=True),
-                    self.dynamic(
-                        axis=[1, 3], keepdim=True)))
-            self.assertTrue(
-                np.allclose(
-                    self.numpy(unbiased=False), self.dynamic(unbiased=False)))
+        self.dtype = 'float64'
+        self.shape = [1, 3, 4, 10]
+        self.axis = [1, 3]
+        self.keepdim = False
+        self.unbiased = True
+        self.set_attrs()
+        self.x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.place=paddle.CUDAPlace(0) \
+            if paddle.fluid.core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def set_attrs(self):
+        pass
+
+    def static(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.shape, self.dtype)
+            out = paddle.var(x, self.axis, self.unbiased, self.keepdim)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x}, fetch_list=[out])
+        return res[0]
+
+    def dygraph(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x)
+        out = paddle.var(x, self.axis, self.unbiased, self.keepdim)
+        paddle.enable_static()
+        return out.numpy()
+
+    def test_api(self):
+        out_ref = ref_var(self.x, self.axis, self.unbiased, self.keepdim)
+        out_dygraph = self.dygraph()
+        out_static = self.static()
+        for out in [out_dygraph, out_static]:
+            self.assertTrue(np.allclose(out_ref, out))
+            self.assertTrue(np.equal(out_ref.shape, out.shape).all())
+
+
+class TestVarAPI_dtype(TestVarAPI):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestVarAPI_axis_int(TestVarAPI):
+    def set_attrs(self):
+        self.axis = 2
+
+
+class TestVarAPI_axis_list(TestVarAPI):
+    def set_attrs(self):
+        self.axis = [1, 2]
+
+
+class TestVarAPI_axis_tuple(TestVarAPI):
+    def set_attrs(self):
+        self.axis = (1, 3)
+
+
+class TestVarAPI_keepdim(TestVarAPI):
+    def set_attrs(self):
+        self.keepdim = False
+
+
+class TestVarAPI_unbiased(TestVarAPI):
+    def set_attrs(self):
+        self.unbiased = False
+
+
+class TestVarAPI_alias(unittest.TestCase):
+    def test_alias(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(np.array([10, 12], 'float32'))
+        out1 = paddle.var(x).numpy()
+        out2 = paddle.tensor.var(x).numpy()
+        out3 = paddle.tensor.stat.var(x).numpy()
+        self.assertTrue(np.allclose(out1, out2))
+        self.assertTrue(np.allclose(out1, out3))
+        paddle.enable_static()
+
+
+class TestVarError(unittest.TestCase):
+    def test_error(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [2, 3, 4], 'int32')
+            self.assertRaises(TypeError, paddle.var, x)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 449ac959188949559056654418ace3e227c368da..6bc42f0712a1a8c9f9a0640e06042c42e7cc948f 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -21,25 +21,25 @@ from op_test import OpTest
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+import paddle
+import paddle.nn.functional as F
 
 CUDA_BLOCK_SIZE = 512
 
 
 class CTCForward(object):
-    def __init__(self, softmax, softmax_lod, labels, labels_lod, blank,
-                 norm_by_times):
+    def __init__(self, softmax, softmax_lod, labels, labels_lod, num_classes,
+                 batch_size, blank, norm_by_times):
         self.softmax = softmax
         self.softmax_lod = softmax_lod
-        assert labels.shape[1] == 1
         self.labels = labels
         self.labels_lod = labels_lod
         self.blank = blank
         self.norm_by_times = norm_by_times
 
         self.level = 0
-        self.num_classes = softmax.shape[1]
-        self.batch_size = len(softmax_lod[self.level])
-        assert self.batch_size == len(labels_lod[self.level])
+        self.num_classes = num_classes
+        self.batch_size = batch_size
 
         self.loss = np.zeros([self.batch_size, 1], dtype="float32")
         self.gradient = np.zeros(self.softmax.shape, dtype="float32")
@@ -163,17 +163,25 @@ class CTCForward(object):
         softmax_offset = 0
         labels_offset = 0
         for i in range(self.batch_size):
-            softmax_start_i = softmax_offset
-            softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
-            labels_start_i = labels_offset
-            labels_end_i = labels_offset + self.labels_lod[self.level][i]
-
-            softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
-            labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
-            self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
-                                                   labels_a_sequence)
-            softmax_offset += self.softmax_lod[self.level][i]
-            labels_offset += self.labels_lod[self.level][i]
+            if self.labels.shape[1] == 1:
+                softmax_start_i = softmax_offset
+                softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
+                labels_start_i = labels_offset
+                labels_end_i = labels_offset + self.labels_lod[self.level][i]
+
+                softmax_a_sequence = self.softmax[softmax_start_i:
+                                                  softmax_end_i, :]
+                labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
+                self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
+                                                       labels_a_sequence)
+                softmax_offset += self.softmax_lod[self.level][i]
+                labels_offset += self.labels_lod[self.level][i]
+            else:
+                softmax_a_sequence = self.softmax[:self.softmax_lod[i], i, :]
+                labels_a_sequence = self.labels[:self.labels_lod[i], :]
+                self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
+                                                       labels_a_sequence)
+
         return self.loss
 
 
@@ -201,7 +209,8 @@ class TestWarpCTCOp(OpTest):
             dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
         loss = ctc.forward()
 
         max_sequence_length = 0
@@ -223,7 +232,7 @@ class TestWarpCTCOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
@@ -237,7 +246,7 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
         self.num_classes = CUDA_BLOCK_SIZE + 2
         self.logits_lod = [[4, 1, 3, 3]]
         self.labels_lod = [[3, 1, 4, 4]]
-        self.blank = 0
+        self.blank = self.num_classes - 1
         self.norm_by_times = False
 
 
@@ -267,7 +276,8 @@ class TestWarpCTCOpWithPadding(OpTest):
             dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.blank, self.norm_by_times)
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
         loss = ctc.forward()
 
         max_sequence_length = 0
@@ -317,7 +327,7 @@ class TestWarpCTCOpWithPadding(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
@@ -333,7 +343,7 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
         self.labels_lod = [[3, 1, 4, 4]]
         self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
         self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
-        self.blank = 0
+        self.blank = self.num_classes - 1
         self.norm_by_times = False
 
 
@@ -389,5 +399,97 @@ class TestWarpCTCOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_label_len_Variable)
 
 
+class TestCTCLossAPICase(unittest.TestCase):
+    def test_functinal_api(self):
+        self.batch_size = 4
+        self.num_classes = CUDA_BLOCK_SIZE + 2
+        self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+
+        logits = np.random.uniform(0.1, 1.0, [
+            max(self.logits_length), self.batch_size, self.num_classes
+        ]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, -1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            0,
+            self.num_classes - 1, [self.batch_size, max(self.labels_length)],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_length, labels,
+                         self.labels_length, self.num_classes, self.batch_size,
+                         self.blank, self.norm_by_times)
+        loss_np = ctc.forward()
+
+        paddle.disable_static()
+        softmax = paddle.to_variable(logits)
+        labels = paddle.to_variable(labels)
+        logits_length = paddle.to_variable(self.logits_length)
+        labels_length = paddle.to_variable(self.labels_length)
+        loss_pd_mean = F.ctc_loss(
+            softmax,
+            labels,
+            logits_length,
+            labels_length,
+            blank=self.blank,
+            reduction='mean')
+        loss_pd_mean = loss_pd_mean.numpy()
+
+        loss_pd_sum = F.ctc_loss(
+            softmax,
+            labels,
+            logits_length,
+            labels_length,
+            blank=self.blank,
+            reduction='sum')
+        loss_pd_sum = loss_pd_sum.numpy()
+        paddle.enable_static()
+        loss_np = np.squeeze(loss_np, axis=-1)
+        loss_np_mean = (loss_np / labels_length.numpy()).mean()
+        loss_np_sum = loss_np.sum()
+
+        self.assertTrue(np.allclose(loss_pd_mean, loss_np_mean, atol=1))
+        self.assertTrue(np.allclose(loss_pd_sum, loss_np_sum, atol=1))
+
+    def test_class_api(self):
+        self.batch_size = 3
+        self.num_classes = 15
+        self.logits_length = np.array([3, 3, 3], dtype=np.int64)
+        self.labels_length = np.array([0, 1, 2], dtype=np.int64)
+        self.blank = 0
+        self.norm_by_times = False
+
+        logits = np.random.uniform(0.1, 1.0, [
+            max(self.logits_length), self.batch_size, self.num_classes
+        ]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, -1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            1,
+            self.num_classes, [self.batch_size, max(self.labels_length)],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_length, labels,
+                         self.labels_length, self.num_classes, self.batch_size,
+                         self.blank, self.norm_by_times)
+        loss_np = ctc.forward()
+
+        paddle.disable_static()
+        softmax = paddle.to_variable(logits)
+        labels = paddle.to_variable(labels)
+        logits_length = paddle.to_variable(self.logits_length)
+        labels_length = paddle.to_variable(self.labels_length)
+
+        loss_pd = paddle.nn.CTCLoss(self.blank, 'none')(
+            softmax, labels, logits_length, labels_length)
+        loss_pd = loss_pd.numpy()
+        paddle.enable_static()
+        loss_np = np.squeeze(loss_np, axis=-1)
+
+        self.assertTrue(np.allclose(loss_pd, loss_np, atol=1))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
index 448751f19dbe76fdbd856d0464e36390c69aba41..21e618a46201659fe0c4e5c67d1d9a8bafd70f1b 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
@@ -62,20 +62,19 @@ class TestZerosLikeImpeartive(unittest.TestCase):
         shape = [3, 4]
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        with paddle.imperative.guard(place):
-            x = paddle.imperative.to_variable(np.ones(shape))
-            for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
-                out = zeros_like(x, dtype)
-                self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
-                                 True)
-
-            out = paddle.tensor.zeros_like(x)
+        paddle.disable_static(place)
+        x = paddle.to_variable(np.ones(shape))
+        for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
+            out = zeros_like(x, dtype)
             self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
                              True)
 
-            out = paddle.tensor.creation.zeros_like(x)
-            self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
-                             True)
+        out = paddle.tensor.zeros_like(x)
+        self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True)
+
+        out = paddle.tensor.creation.zeros_like(x)
+        self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True)
+        paddle.enable_static()
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_op.py b/python/paddle/fluid/tests/unittests/test_zeros_op.py
index 0cf51a87cf6b844c053ab1335e20df108d16e177..23dec935507fd977f884e952451b5ea98c935893 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_op.py
@@ -39,15 +39,15 @@ class ApiZerosTest(unittest.TestCase):
         with program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype="float64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="float64")
         self.assertEqual((result == expected_result).all(), True)
 
-        with paddle.program_guard(Program()):
+        with paddle.static.program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -55,7 +55,7 @@ class ApiZerosTest(unittest.TestCase):
         with program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -64,7 +64,7 @@ class ApiZerosTest(unittest.TestCase):
             out_np = np.zeros(shape=(1), dtype='float32')
             out = paddle.zeros(shape=[1], dtype="float32")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result = exe.run(fetch_list=[out])
             self.assertEqual((result == out_np).all(), True)
 
@@ -72,7 +72,7 @@ class ApiZerosTest(unittest.TestCase):
         with program_guard(Program()):
             zeros = fluid.layers.zeros(shape=[10], dtype="int64")
             place = paddle.CPUPlace()
-            exe = paddle.Executor(place)
+            exe = paddle.static.Executor(place)
             result, = exe.run(fetch_list=[zeros])
             expected_result = np.zeros(10, dtype="int64")
         self.assertEqual((result == expected_result).all(), True)
@@ -81,13 +81,13 @@ class ApiZerosTest(unittest.TestCase):
 class ApiZerosError(unittest.TestCase):
     def test_errors(self):
         def test_error1():
-            with paddle.program_guard(fluid.Program()):
+            with paddle.static.program_guard(fluid.Program()):
                 ones = fluid.layers.zeros(shape=10, dtype="int64")
 
         self.assertRaises(TypeError, test_error1)
 
         def test_error2():
-            with paddle.program_guard(fluid.Program()):
+            with paddle.static.program_guard(fluid.Program()):
                 ones = fluid.layers.zeros(shape=[10], dtype="int8")
 
         self.assertRaises(TypeError, test_error2)
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index 19af0c92154e0fa1f631ef885588d640a338fe1b..0de0eeb464ad700abb2144e49a822582b8653589 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -17,6 +17,7 @@ no_check_set_white_list = [
     'fake_quantize_range_abs_max',
     'coalesce_tensor',
     'flatten2',
+    'flatten_contiguous_range',
     'lrn',
     'squeeze2',
     'reshape2',
@@ -25,4 +26,5 @@ no_check_set_white_list = [
     'cross_entropy2',
     'seed',
     'amp_check_finite_and_scale',
+    'cudnn_lstm',
 ]
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
index ce6868b5c70ae1218df48f899f936f57f6734582..5300ab935a3405f9f76c08a7f2ece8bad33367ac 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
@@ -41,7 +41,8 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'unpool', \
     'yolov3_loss', \
     'inverse', \
-    'bilateral_slice'
+    'bilateral_slice',\
+    'cudnn_lstm'
 ]
 
 NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp']
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 478e05c8975d06d602253d692114f77ca25de0af..9f0089f68ab1efa3dc4bd0e89e87b3b18e44b24f 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -15,7 +15,10 @@
 
 import sys
 import os
-__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer']
+__all__ = [
+    'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer',
+    'HeterXpuTrainer'
+]
 
 
 class TrainerDesc(object):
@@ -48,6 +51,43 @@ class TrainerDesc(object):
         self._program = None
         self._infer = False
 
+    def _set_heter_info(self, ret):
+        #ret = = fu.split_program_by_device(program)
+        #start_list, end_list, send_list, recv_list, program_list = fu.split_program_by_device(program)
+        #if len(start_list) != 3:
+        #    print("start_list len=", len(start_list), " will not set heter info")
+        #    return
+        #for i in start_list[0]:
+        #    self.proto_desc.op_run_start_idx.append(i)
+        #for i in end_list[0]:
+        #    self.proto_desc.op_run_end_idx.append(i)
+        #for i in send_list[0]:
+        #    self.proto_desc.op_run_send_list.append(i)
+        #for i in recv_list[0]:
+        #    self.proto_desc.op_run_recv_list.append(i)
+        if ret is None:
+            return
+        #for i in ret[0]: # start_list[1]:
+        #    self.proto_desc.xpu_start_idx.append(i)
+        self.proto_desc.xpu_start_idx = ret[0]
+
+        #for i in ret[1]:  #end_list[1]:
+        #    self.proto_desc.o_end_idx.append(i)
+        self.proto_desc.xpu_end_idx = ret[1]
+        for i in ret[2]:  # send_list[1]:
+            self.proto_desc.xpu_send_list.append(i)
+        for i in ret[3]:  # recv_list[1]:
+            self.proto_desc.xpu_recv_list.append(i)
+
+        #for i in start_list[2]:
+        #    self.proto_desc.op_run_end_start_idx.append(i)
+        #for i in end_list[2]:
+        #    self.proto_desc.op_run_end_idx.append(i)
+        #for i in send_list[2]:
+        #    self.proto_desc.op_run_end_send_list.append(i)
+        #for i in recv_list[2]:
+        #    self.proto_desc.op_run_end_recv_list.append(i)
+
     def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
         # convert fetch_info to list
         fetch_info = list(fetch_info)
@@ -122,6 +162,10 @@ class TrainerDesc(object):
         for param in dump_param:
             self.proto_desc.dump_param.append(param)
 
+    def _set_worker_places(self, worker_places):
+        for place in worker_places:
+            self.proto_desc.worker_places.append(place)
+
     def _set_thread_barrier(self, thread_barrier):
         self.proto_desc.thread_barrier = thread_barrier
 
@@ -272,6 +316,30 @@ class DistMultiTrainer(TrainerDesc):
         self._device_worker._gen_worker_desc(self.proto_desc)
 
 
+class HeterXpuTrainer(TrainerDesc):
+    """
+    Implement of HeterXpuTrainer.
+    It's for Distributed training.
+    """
+
+    def __init__(self):
+        super(HeterXpuTrainer, self).__init__()
+        pass
+
+    def _set_program(self, program):
+        super(HeterXpuTrainer, self)._set_program(program)
+        self._program = program
+
+    def _gen_trainer_desc(self):
+        super(HeterXpuTrainer, self)._gen_trainer_desc()
+        self.proto_desc.class_name = "HeterXpuTrainer"
+        if self._program == None:
+            raise RuntimeError("None Program")
+        self._device_worker._set_infer(self._infer)
+        self._device_worker._set_program(self._program)
+        self._device_worker._gen_worker_desc(self.proto_desc)
+
+
 class PipelineTrainer(TrainerDesc):
     """
     Implement of PipelineTrainer.
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index c2d80f52b8db8dc9efbde079f93eca4bd5877cc2..f7573f6045dce2178d9c780df6717663f40a0871 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -22,7 +22,7 @@ from paddle.fluid.log_helper import get_logger
 local_logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
 
-from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer
+from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer
 from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT
 from .framework import Variable
 from multiprocessing import Process, Manager
@@ -75,6 +75,8 @@ class TrainerFactory(object):
                 if opt_info.get("dump_param") is not None and len(
                         opt_info.get("dump_param")) != 0:
                     trainer._set_dump_param(opt_info["dump_param"])
+                if opt_info.get("worker_places") is not None:
+                    trainer._set_worker_places(opt_info["worker_places"])
                 if opt_info.get("enable_random_dump") is not None:
                     trainer._set_enable_random_dump(opt_info[
                         "enable_random_dump"])
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index 3078f432c3a70308c929a4fdface215fb79eebcb..f01dc01973a603a0b6ea08358f73237c68924c78 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -14,23 +14,52 @@
 
 # TODO: import framework api under this directory 
 __all__ = [
-    'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard',
-    'BuildStrategy', 'CompiledProgram', 'default_main_program',
-    'default_startup_program', 'create_global_var', 'create_parameter', 'Print',
-    'py_func', 'ExecutionStrategy', 'name_scope', 'ParallelExecutor',
-    'ParamAttr', 'Program', 'program_guard', 'Variable', 'WeightNormParamAttr',
-    'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace'
+    'create_global_var', 'create_parameter', 'ParamAttr', 'Variable',
+    'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'get_default_dtype',
+    'set_default_dtype'
+]
+
+__all__ += [
+    'BackwardStrategy', 'grad', 'LayerList', 'load', 'save', 'prepare_context',
+    'to_variable', 'no_grad', 'ParallelEnv', 'DataParallel'
+]
+
+__all__ += [
+    'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
+    'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
 ]
 
 from . import random
 from .random import manual_seed
-from ..fluid.executor import Executor, global_scope, scope_guard
-from ..fluid.backward import append_backward, gradients
-from ..fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
-from ..fluid.framework import default_main_program, default_startup_program, name_scope, Program, program_guard, Variable
-from ..fluid.layers.control_flow import Print
-from ..fluid.layers.nn import py_func
-from ..fluid.parallel_executor import ParallelExecutor
-from ..fluid.param_attr import ParamAttr, WeightNormParamAttr
-from ..fluid.layers.tensor import create_global_var, create_parameter
-from ..fluid.core import CPUPlace, CUDAPlace, CUDAPinnedPlace
+from .framework import get_default_dtype
+from .framework import set_default_dtype
+
+from ..fluid.framework import Variable  #DEFINE_ALIAS
+from ..fluid.framework import ComplexVariable  #DEFINE_ALIAS
+from ..fluid.param_attr import ParamAttr  #DEFINE_ALIAS
+from ..fluid.layers.tensor import create_global_var  #DEFINE_ALIAS
+from ..fluid.layers.tensor import create_parameter  #DEFINE_ALIAS
+from ..fluid.core import CPUPlace  #DEFINE_ALIAS
+from ..fluid.core import CUDAPlace  #DEFINE_ALIAS
+from ..fluid.core import CUDAPinnedPlace  #DEFINE_ALIAS
+from ..fluid.core import VarBase  #DEFINE_ALIAS
+
+from paddle.fluid import core  #DEFINE_ALIAS
+from ..fluid.dygraph.base import no_grad  #DEFINE_ALIAS
+from ..fluid.dygraph.base import to_variable  #DEFINE_ALIAS
+from ..fluid.dygraph.base import grad  #DEFINE_ALIAS
+from ..fluid.dygraph.checkpoint import load_dygraph as load  #DEFINE_ALIAS
+from ..fluid.dygraph.checkpoint import save_dygraph as save  #DEFINE_ALIAS
+from ..fluid.dygraph.parallel import prepare_context  #DEFINE_ALIAS
+from ..fluid.dygraph.parallel import ParallelEnv  #DEFINE_ALIAS
+from ..fluid.dygraph.parallel import DataParallel  #DEFINE_ALIAS
+
+from ..fluid.dygraph.learning_rate_scheduler import NoamDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay  #DEFINE_ALIAS
+from ..fluid.dygraph.learning_rate_scheduler import CosineDecay  #DEFINE_ALIAS
+
+BackwardStrategy = core.BackwardStrategy
diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py
index 65654b59c083086967c1ef78f14b740b0779e722..41ec18ce32d3036c3db86aaa98053f59ff61f717 100644
--- a/python/paddle/framework/framework.py
+++ b/python/paddle/framework/framework.py
@@ -13,5 +13,70 @@
 # limitations under the License.
 
 # TODO: define framework api 
-# __all__ = ['set_default_dtype',
-#            'get_default_dtype']
+from paddle.fluid.layer_helper_base import LayerHelperBase
+from paddle.fluid.data_feeder import convert_dtype
+import numpy as np
+
+__all__ = ['set_default_dtype', 'get_default_dtype']
+
+
+def set_default_dtype(d):
+    """
+    Set default dtype. The default dtype is initially float32
+
+    Args:
+        d(string|np.dtype): the dtype to make the default. It only
+                            supports float16, float32 and float64.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.set_default_dtype("float32")
+
+    """
+    if isinstance(d, type):
+        if d in [np.float16, np.float32, np.float64]:
+            d = d.__name__
+        else:
+            raise TypeError(
+                "set_default_dtype only supports [float16, float32, float64] "
+                ", but received %s" % d.__name__)
+    else:
+        if d in [
+                'float16', 'float32', 'float64', u'float16', u'float32',
+                u'float64'
+        ]:
+            # this code is a little bit dangerous, since error could happen
+            # when casting no-ascii code to str in python2.
+            # but since the set itself is limited, so currently, it is good.
+            # however, jointly supporting python2 and python3, (as well as python4 maybe)
+            # may still be a long-lasting problem.
+            d = str(d)
+        else:
+            raise TypeError(
+                "set_default_dtype only supports [float16, float32, float64] "
+                ", but received %s" % str(d))
+
+    LayerHelperBase.set_default_dtype(d)
+
+
+def get_default_dtype():
+    """
+    Get the current default dtype. The default dtype is initially float32.
+
+    Args:
+        None.
+    Returns:
+        The default dtype.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.get_default_dtype()
+    """
+    return LayerHelperBase.get_default_dtype()
diff --git a/python/paddle/imperative/__init__.py b/python/paddle/imperative/__init__.py
deleted file mode 100644
index 489888a2fef39b2cca5b918a412d231784471ddc..0000000000000000000000000000000000000000
--- a/python/paddle/imperative/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# define api used to run in imperative mode 
-__all__ = [
-    'BackwardStrategy', 'enabled', 'grad', 'guard', 'LayerList', 'load', 'save',
-    'prepare_context', 'to_variable', 'TracedLayer', 'no_grad', 'ParallelEnv',
-    'ProgramTranslator', 'declarative', 'DataParallel', 'TranslatedLayer', 'jit'
-]
-
-__all__ += [
-    'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
-    'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
-]
-
-from paddle.fluid import core
-from ..fluid.dygraph.base import enabled, guard, no_grad, to_variable, grad
-from ..fluid.dygraph.checkpoint import load_dygraph as load
-from ..fluid.dygraph.checkpoint import save_dygraph as save
-from ..fluid.dygraph.parallel import prepare_context, ParallelEnv, DataParallel
-from ..fluid.dygraph.jit import TracedLayer, declarative
-from ..fluid.dygraph import ProgramTranslator
-from . import jit
-
-from ..fluid.dygraph.learning_rate_scheduler import NoamDecay, PiecewiseDecay, NaturalExpDecay, ExponentialDecay, \
-        InverseTimeDecay, PolynomialDecay, CosineDecay
-
-BackwardStrategy = core.BackwardStrategy
diff --git a/python/paddle/incubate/complex/tensor/linalg.py b/python/paddle/incubate/complex/tensor/linalg.py
index 3badf36280e27c9d7962a2b7b3fff596fd0e8cb3..946a0fd5534d13166706523675c93ef1d01cfa54 100644
--- a/python/paddle/incubate/complex/tensor/linalg.py
+++ b/python/paddle/incubate/complex/tensor/linalg.py
@@ -56,20 +56,20 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
                 # [1.+5.j 5.+9.j]         
     """
     # x = a + bi, y = c + di
-    # mm(x, y) = mm(a, c) - mm(b, d) + (mm(a, d) + mm(b, c))i
+    # P1 = ac; P2 = (a + b)(c + d); P3 = bd; then mm(x, y) = (P1-P3) + (P2-P1-P3)j
     complex_variable_exists([x, y], "matmul")
     a, b = (x.real, x.imag) if is_complex(x) else (x, None)
     c, d = (y.real, y.imag) if is_complex(y) else (y, None)
-    ac = layers.matmul(a, c, transpose_x, transpose_y, alpha, name)
+    P1 = layers.matmul(a, c, transpose_x, transpose_y, alpha, name)
     if is_real(b) and is_real(d):
-        bd = layers.matmul(b, d, transpose_x, transpose_y, alpha, name)
-        real = ac - bd
-        imag = layers.matmul(a, d, transpose_x, transpose_y, alpha, name) + \
-               layers.matmul(b, c, transpose_x, transpose_y, alpha, name)
+        P2 = layers.matmul(a + b, c + d, transpose_x, transpose_y, alpha, name)
+        P3 = layers.matmul(b, d, transpose_x, transpose_y, alpha, name)
+        real = P1 - P3
+        imag = P2 - P1 - P3
     elif is_real(b):
-        real = ac
+        real = P1
         imag = layers.matmul(b, c, transpose_x, transpose_y, alpha, name)
     else:
-        real = ac
+        real = P1
         imag = layers.matmul(a, d, transpose_x, transpose_y, alpha, name)
     return ComplexVariable(real, imag)
diff --git a/python/paddle/incubate/complex/tensor/math.py b/python/paddle/incubate/complex/tensor/math.py
index 5c26d6da8d9bb002a117ee40e0ce209c3fa0db9f..465e4887a1f8a8dc1d53afac8869f0b55776f3d2 100644
--- a/python/paddle/incubate/complex/tensor/math.py
+++ b/python/paddle/incubate/complex/tensor/math.py
@@ -261,8 +261,8 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None):
             
             case1 = np.random.randn(3, 10, 10).astype('float64') + 1j * np.random.randn(3, 10, 10).astype('float64')
             
-            paddle.enable_imperative()
-            case1 = paddle.imperative.to_variable(case1)
+            paddle.disable_static()
+            case1 = paddle.to_tensor(case1)
             data1 = paddle.complex.trace(case1, offset=1, axis1=1, axis2=2) # data1.shape = [3]
     """
     complex_variable_exists([x], "trace")
@@ -330,8 +330,8 @@ def sum(input, dim=None, keep_dim=False, name=None):
 
     """
     complex_variable_exists([input], "sum")
-    real = math.sum(input.real, dim=dim, keep_dim=keep_dim, name=name)
-    imag = math.sum(input.imag, dim=dim, keep_dim=keep_dim, name=name)
+    real = math.sum(input.real, axis=dim, keepdim=keep_dim, name=name)
+    imag = math.sum(input.imag, axis=dim, keepdim=keep_dim, name=name)
     return ComplexVariable(real, imag)
 
 
diff --git a/python/paddle/incubate/hapi/__init__.py b/python/paddle/incubate/hapi/__init__.py
index a6b5faef57ca95188f0759f53753177e4f5946f3..c0361fa33246ff3315a107c520972ca6bebc8168 100644
--- a/python/paddle/incubate/hapi/__init__.py
+++ b/python/paddle/incubate/hapi/__init__.py
@@ -20,7 +20,6 @@ from . import download
 from . import model
 from .model import *
 
-from . import metrics
 from . import datasets
 from . import distributed
 from . import vision
@@ -39,7 +38,6 @@ __all__ = [
     'datasets',
     'distributed',
     'download',
-    'metrics',
     'vision',
     'text',
     'utils',
diff --git a/python/paddle/incubate/hapi/callbacks.py b/python/paddle/incubate/hapi/callbacks.py
index 741552511f9fdc93d9e370fc7d45f9d84a1d4392..0804708210a9749813e195a8b5579b339986acd6 100644
--- a/python/paddle/incubate/hapi/callbacks.py
+++ b/python/paddle/incubate/hapi/callbacks.py
@@ -295,8 +295,8 @@ class ProgBarLogger(Callback):
             import paddle.fluid as fluid
             import paddle.incubate.hapi as hapi
 
-            inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')]
-            labels = [hapi.Input('label', [None, 1], 'int64')]
+            inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')]
+            labels = [hapi.Input([None, 1], 'int64', 'label')]
 
             train_dataset = hapi.datasets.MNIST(mode='train')
 
@@ -305,8 +305,8 @@ class ProgBarLogger(Callback):
 
             optim = fluid.optimizer.Adam(0.001)
             model.prepare(optimizer=optim,
-                        loss_function=paddle.nn.CrossEntropyLoss(),
-                        metrics=hapi.metrics.Accuracy())
+                        loss=paddle.nn.CrossEntropyLoss(),
+                        metrics=paddle.metric.Accuracy())
 
             callback = hapi.callbacks.ProgBarLogger(log_freq=10)
             model.fit(train_dataset, batch_size=64, callbacks=callback)
@@ -431,8 +431,8 @@ class ModelCheckpoint(Callback):
             import paddle.fluid as fluid
             import paddle.incubate.hapi as hapi
 
-            inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')]
-            labels = [hapi.Input('label', [None, 1], 'int64')]
+            inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')]
+            labels = [hapi.Input([None, 1], 'int64', 'label')]
 
             train_dataset = hapi.datasets.MNIST(mode='train')
 
@@ -441,8 +441,8 @@ class ModelCheckpoint(Callback):
 
             optim = fluid.optimizer.Adam(0.001)
             model.prepare(optimizer=optim,
-                        loss_function=paddle.nn.CrossEntropyLoss(),
-                        metrics=hapi.metrics.Accuracy())
+                        loss=paddle.nn.CrossEntropyLoss(),
+                        metrics=paddle.metric.Accuracy())
 
             callback = hapi.callbacks.ModelCheckpoint(save_dir='./temp')
             model.fit(train_dataset, batch_size=64, callbacks=callback)
diff --git a/python/paddle/incubate/hapi/datasets/__init__.py b/python/paddle/incubate/hapi/datasets/__init__.py
index fc5df6401992def4bc37329794e534a832924da3..a88b0e6bbf1975d97bfeb68025b978ce877c6baf 100644
--- a/python/paddle/incubate/hapi/datasets/__init__.py
+++ b/python/paddle/incubate/hapi/datasets/__init__.py
@@ -15,11 +15,41 @@
 from . import folder
 from . import mnist
 from . import flowers
+from . import cifar
+from . import voc2012
+from . import conll05
+from . import imdb
+from . import imikolov
+from . import movielens
+from . import movie_reviews
+from . import uci_housing
+from . import wmt14
+from . import wmt16
 
 from .folder import *
 from .mnist import *
 from .flowers import *
+from .cifar import *
+from .voc2012 import *
+from .conll05 import *
+from .imdb import *
+from .imikolov import *
+from .movielens import *
+from .movie_reviews import *
+from .uci_housing import *
+from .wmt14 import *
+from .wmt16 import *
 
 __all__ = folder.__all__ \
-        + mnist.__all__ \
-        + flowers.__all__
+          + mnist.__all__ \
+          + flowers.__all__ \
+          + cifar.__all__ \
+          + voc2012.__all__ \
+          + conll05.__all__ \
+          + imdb.__all__ \
+          + imikolov.__all__ \
+          + movielens.__all__ \
+          + movie_reviews.__all__ \
+          + uci_housing.__all__ \
+          + wmt14.__all__ \
+          + wmt16.__all__
diff --git a/python/paddle/incubate/hapi/datasets/cifar.py b/python/paddle/incubate/hapi/datasets/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..adfa786e615368ba90dab154924678de79104b55
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/cifar.py
@@ -0,0 +1,207 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import tarfile
+import numpy as np
+import six
+from six.moves import cPickle as pickle
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ['Cifar10', 'Cifar100']
+
+URL_PREFIX = 'https://dataset.bj.bcebos.com/cifar/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+MODE_FLAG_MAP = {
+    'train10': 'data_batch',
+    'test10': 'test_batch',
+    'train100': 'train',
+    'test100': 'test'
+}
+
+
+class Cifar10(Dataset):
+    """
+    Implementation of `Cifar-10 <https://www.cs.toronto.edu/~kriz/cifar.html>`_
+    dataset, which has 10 categories.
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' mode. Default 'train'.
+        transform(callable): transform to perform on image, None for on transform.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of cifar-10 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Cifar10
+	    from paddle.incubate.hapi.vision.transforms import Normalize
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+		    self.fc = paddle.nn.Linear(3072, 10, act='softmax')
+
+		def forward(self, image, label):
+		    image = paddle.reshape(image, (3, -1))
+		    return self.fc(image), label
+
+	    paddle.disable_static()
+
+	    normalize = Normalize(mean=[0.5, 0.5, 0.5],
+				std=[0.5, 0.5, 0.5])
+	    cifar10 = Cifar10(mode='train', transform=normalize)
+
+	    for i in range(10):
+		image, label = cifar10[i]
+		image = paddle.to_tensor(image)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label = model(image, label)
+		print(image.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        assert mode.lower() in ['train', 'test', 'train', 'test'], \
+            "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self._init_url_md5_flag()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, self.data_url, self.data_md5, 'cifar', download)
+
+        self.transform = transform
+
+        # read dataset into memory
+        self._load_data()
+
+    def _init_url_md5_flag(self):
+        self.data_url = CIFAR10_URL
+        self.data_md5 = CIFAR10_MD5
+        self.flag = MODE_FLAG_MAP[self.mode + '10']
+
+    def _load_data(self):
+        self.data = []
+        with tarfile.open(self.data_file, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if self.flag in each_item.name)
+
+            for name in names:
+                if six.PY2:
+                    batch = pickle.load(f.extractfile(name))
+                else:
+                    batch = pickle.load(f.extractfile(name), encoding='bytes')
+
+                data = batch[six.b('data')]
+                labels = batch.get(
+                    six.b('labels'), batch.get(six.b('fine_labels'), None))
+                assert labels is not None
+                for sample, label in six.moves.zip(data, labels):
+                    self.data.append((sample, label))
+
+    def __getitem__(self, idx):
+        image, label = self.data[idx]
+        if self.transform is not None:
+            image = self.transform(image)
+        return image, label
+
+    def __len__(self):
+        return len(self.data)
+
+
+class Cifar100(Cifar10):
+    """
+    Implementation of `Cifar-100 <https://www.cs.toronto.edu/~kriz/cifar.html>`_
+    dataset, which has 100 categories.
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' mode. Default 'train'.
+        transform(callable): transform to perform on image, None for on transform.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of cifar-100 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Cifar100
+	    from paddle.incubate.hapi.vision.transforms import Normalize
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+		    self.fc = paddle.nn.Linear(3072, 100, act='softmax')
+
+		def forward(self, image, label):
+		    image = paddle.reshape(image, (3, -1))
+		    return self.fc(image), label
+
+	    paddle.disable_static()
+
+	    normalize = Normalize(mean=[0.5, 0.5, 0.5],
+				std=[0.5, 0.5, 0.5])
+	    cifar100 = Cifar100(mode='train', transform=normalize)
+
+	    for i in range(10):
+		image, label = cifar100[i]
+		image = paddle.to_tensor(image)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label = model(image, label)
+		print(image.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        super(Cifar100, self).__init__(data_file, mode, transform, download)
+
+    def _init_url_md5_flag(self):
+        self.data_url = CIFAR100_URL
+        self.data_md5 = CIFAR100_MD5
+        self.flag = MODE_FLAG_MAP[self.mode + '100']
diff --git a/python/paddle/incubate/hapi/datasets/conll05.py b/python/paddle/incubate/hapi/datasets/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..094e3559335363524c4ae893f70294a4afaa7037
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/conll05.py
@@ -0,0 +1,297 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import gzip
+import tarfile
+import numpy as np
+import six
+from six.moves import cPickle as pickle
+
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['Conll05st']
+
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+class Conll05st(Dataset):
+    """
+    Implementation of `Conll05st <https://www.cs.upc.edu/~srlconll/soft.html>`_
+    test dataset.
+
+    Note: only support download test dataset automatically for that
+          only test dataset of Conll05st is public.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        word_dict_file(str): path to word dictionary file, can be set None if
+            :attr:`download` is True. Default None
+        verb_dict_file(str): path to verb dictionary file, can be set None if
+            :attr:`download` is True. Default None
+        target_dict_file(str): path to target dictionary file, can be set None if
+            :attr:`download` is True. Default None
+        emb_file(str): path to embedding dictionary file, only used for
+            :code:`get_embedding` can be set None if :attr:`download` is
+            True. Default None
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file`
+            :attr:`target_dict_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of conll05st dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Conll05st
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, pred_idx, mark, label):
+		    return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label)
+
+	    paddle.disable_static()
+
+	    conll05st = Conll05st()
+
+	    for i in range(10):
+		pred_idx, mark, label= conll05st[i][-3:]
+		pred_idx = paddle.to_tensor(pred_idx)
+		mark = paddle.to_tensor(mark)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		pred_idx, mark, label= model(pred_idx, mark, label)
+		print(pred_idx.numpy(), mark.numpy(), label.numpy())
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 word_dict_file=None,
+                 verb_dict_file=None,
+                 target_dict_file=None,
+                 emb_file=None,
+                 download=True):
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, DATA_URL, DATA_MD5, 'conll05st', download)
+
+        self.word_dict_file = word_dict_file
+        if self.word_dict_file is None:
+            assert download, "word_dict_file is not set and downloading automatically is disabled"
+            self.word_dict_file = _check_exists_and_download(
+                word_dict_file, WORDDICT_URL, WORDDICT_MD5, 'conll05st',
+                download)
+
+        self.verb_dict_file = verb_dict_file
+        if self.verb_dict_file is None:
+            assert download, "verb_dict_file is not set and downloading automatically is disabled"
+            self.verb_dict_file = _check_exists_and_download(
+                verb_dict_file, VERBDICT_URL, VERBDICT_MD5, 'conll05st',
+                download)
+
+        self.target_dict_file = target_dict_file
+        if self.target_dict_file is None:
+            assert download, "target_dict_file is not set and downloading automatically is disabled"
+            self.target_dict_file = _check_exists_and_download(
+                target_dict_file, TRGDICT_URL, TRGDICT_MD5, 'conll05st',
+                download)
+
+        self.word_dict = self._load_dict(self.word_dict_file)
+        self.predicate_dict = self._load_dict(self.verb_dict_file)
+        self.label_dict = self._load_label_dict(self.target_dict_file)
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _load_label_dict(self, filename):
+        d = dict()
+        tag_dict = set()
+        with open(filename, 'r') as f:
+            for i, line in enumerate(f):
+                line = line.strip()
+                if line.startswith("B-"):
+                    tag_dict.add(line[2:])
+                elif line.startswith("I-"):
+                    tag_dict.add(line[2:])
+            index = 0
+            for tag in tag_dict:
+                d["B-" + tag] = index
+                index += 1
+                d["I-" + tag] = index
+                index += 1
+            d["O"] = index
+        return d
+
+    def _load_dict(self, filename):
+        d = dict()
+        with open(filename, 'r') as f:
+            for i, line in enumerate(f):
+                d[line.strip()] = i
+        return d
+
+    def _load_anno(self):
+        tf = tarfile.open(self.data_file)
+        wf = tf.extractfile(
+            "conll05st-release/test.wsj/words/test.wsj.words.gz")
+        pf = tf.extractfile(
+            "conll05st-release/test.wsj/props/test.wsj.props.gz")
+        self.sentences = []
+        self.predicates = []
+        self.labels = []
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in zip(words_file, props_file):
+                word = cpt.to_text(word.strip())
+                label = cpt.to_text(label.strip().split())
+
+                if len(label) == 0:  # end of sentence
+                    for i in range(len(one_seg[0])):
+                        a_kind_lable = [x[i] for x in one_seg]
+                        labels.append(a_kind_lable)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and is_in_bracket == False:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket == True:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            self.sentences.append(sentences)
+                            self.predicates.append(verb_list[i])
+                            self.labels.append(lbl_seq)
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    def __getitem__(self, idx):
+        sentence = self.sentences[idx]
+        predicate = self.predicates[idx]
+        labels = self.labels[idx]
+
+        sen_len = len(sentence)
+
+        verb_index = labels.index('B-V')
+        mark = [0] * len(labels)
+        if verb_index > 0:
+            mark[verb_index - 1] = 1
+            ctx_n1 = sentence[verb_index - 1]
+        else:
+            ctx_n1 = 'bos'
+
+        if verb_index > 1:
+            mark[verb_index - 2] = 1
+            ctx_n2 = sentence[verb_index - 2]
+        else:
+            ctx_n2 = 'bos'
+
+        mark[verb_index] = 1
+        ctx_0 = sentence[verb_index]
+
+        if verb_index < len(labels) - 1:
+            mark[verb_index + 1] = 1
+            ctx_p1 = sentence[verb_index + 1]
+        else:
+            ctx_p1 = 'eos'
+
+        if verb_index < len(labels) - 2:
+            mark[verb_index + 2] = 1
+            ctx_p2 = sentence[verb_index + 2]
+        else:
+            ctx_p2 = 'eos'
+
+        word_idx = [self.word_dict.get(w, UNK_IDX) for w in sentence]
+
+        ctx_n2_idx = [self.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+        ctx_n1_idx = [self.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+        ctx_0_idx = [self.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+        ctx_p1_idx = [self.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+        ctx_p2_idx = [self.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+        pred_idx = [self.predicate_dict.get(predicate)] * sen_len
+        label_idx = [self.label_dict.get(w) for w in labels]
+
+        return (np.array(word_idx), np.array(ctx_n2_idx), np.array(ctx_n1_idx),
+                np.array(ctx_0_idx), np.array(ctx_p1_idx), np.array(ctx_p2_idx),
+                np.array(pred_idx), np.array(mark), np.array(label_idx))
+
+    def __len__(self):
+        return len(self.sentences)
+
+    def get_dict(self):
+        """
+        Get the word, verb and label dictionary of Wikipedia corpus.
+        """
+        return self.word_dict, self.predicate_dict, self.label_dict
+
+    def get_embedding(self):
+        return self.emb_file
diff --git a/python/paddle/incubate/hapi/datasets/flowers.py b/python/paddle/incubate/hapi/datasets/flowers.py
index 6f56cc82c1cba800002d82cc8a2bd5ddae619f9e..141d2a53b577b8c9be9ac153a36c5b2fa51ded77 100644
--- a/python/paddle/incubate/hapi/datasets/flowers.py
+++ b/python/paddle/incubate/hapi/datasets/flowers.py
@@ -36,12 +36,13 @@ SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data
 # and trnid is the flag of train data. But test data is more than train data.
 # So we exchange the train data and test data.
-MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': "valid"}
+MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': 'valid'}
 
 
 class Flowers(Dataset):
     """
-    Implement of flowers dataset
+    Implementation of `Flowers <https://www.robots.ox.ac.uk/~vgg/data/flowers/>`_
+    dataset
 
     Args:
         data_file(str): path to data file, can be set None if
@@ -51,9 +52,9 @@ class Flowers(Dataset):
         setid_file(str): path to subset index file, can be set
             None if :attr:`download` is True. Default None
         mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
-        download(bool): whether auto download mnist dataset if
-            :attr:`image_path`/:attr:`label_path` unset. Default
-            True
+        transform(callable): transform to perform on image, None for on transform.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
 
     Examples:
         
@@ -82,19 +83,19 @@ class Flowers(Dataset):
 
         self.data_file = data_file
         if self.data_file is None:
-            assert download, "data_file not set and auto download disabled"
+            assert download, "data_file is not set and downloading automatically is disabled"
             self.data_file = _check_exists_and_download(
                 data_file, DATA_URL, DATA_MD5, 'flowers', download)
 
         self.label_file = label_file
         if self.label_file is None:
-            assert download, "label_file not set and auto download disabled"
+            assert download, "label_file is not set and downloading automatically is disabled"
             self.label_file = _check_exists_and_download(
                 label_file, LABEL_URL, LABEL_MD5, 'flowers', download)
 
         self.setid_file = setid_file
         if self.setid_file is None:
-            assert download, "setid_file not set and auto download disabled"
+            assert download, "setid_file is not set and downloading automatically is disabled"
             self.setid_file = _check_exists_and_download(
                 setid_file, SETID_URL, SETID_MD5, 'flowers', download)
 
diff --git a/python/paddle/incubate/hapi/datasets/imdb.py b/python/paddle/incubate/hapi/datasets/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d166bc784a382ac5ae70491d3e8061ad1d1e9f
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/imdb.py
@@ -0,0 +1,144 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import re
+import six
+import string
+import tarfile
+import numpy as np
+import collections
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ['Imdb']
+
+URL = 'https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+class Imdb(Dataset):
+    """
+    Implementation of `IMDB <https://www.imdb.com/interfaces/>`_ dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' 'test' mode. Default 'train'.
+        cutoff(int): cutoff number for building word dictionary. Default 150.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of IMDB dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Imdb
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, doc, label):
+		    return paddle.sum(doc), label
+
+	    paddle.disable_static()
+
+	    imdb = Imdb(mode='train')
+
+	    for i in range(10):
+		doc, label = imdb[i]
+		doc = paddle.to_tensor(doc)
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label = model(doc, label)
+		print(doc.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self, data_file=None, mode='train', cutoff=150, download=True):
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'imdb', download)
+
+        # Build a word dictionary from the corpus
+        self.word_idx = self._build_work_dict(cutoff)
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _build_work_dict(self, cutoff):
+        word_freq = collections.defaultdict(int)
+        pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
+        for doc in self._tokenize(pattern):
+            for word in doc:
+                word_freq[word] += 1
+
+        # Not sure if we should prune less-frequent words here.
+        word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
+
+        dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*dictionary))
+        word_idx = dict(list(zip(words, six.moves.range(len(words)))))
+        word_idx['<unk>'] = len(words)
+        return word_idx
+
+    def _tokenize(self, pattern):
+        data = []
+        with tarfile.open(self.data_file) as tarf:
+            tf = tarf.next()
+            while tf != None:
+                if bool(pattern.match(tf.name)):
+                    # newline and punctuations removal and ad-hoc tokenization.
+                    data.append(
+                        tarf.extractfile(tf).read().rstrip(six.b("\n\r"))
+                        .translate(None, six.b(string.punctuation)).lower(
+                        ).split())
+                tf = tarf.next()
+
+        return data
+
+    def _load_anno(self):
+        pos_pattern = re.compile("aclImdb/{}/pos/.*\.txt$".format(self.mode))
+        neg_pattern = re.compile("aclImdb/{}/neg/.*\.txt$".format(self.mode))
+
+        UNK = self.word_idx['<unk>']
+
+        self.docs = []
+        self.labels = []
+        for doc in self._tokenize(pos_pattern):
+            self.docs.append([self.word_idx.get(w, UNK) for w in doc])
+            self.labels.append(0)
+        for doc in self._tokenize(neg_pattern):
+            self.docs.append([self.word_idx.get(w, UNK) for w in doc])
+            self.labels.append(1)
+
+    def __getitem__(self, idx):
+        return (np.array(self.docs[idx]), np.array([self.labels[idx]]))
+
+    def __len__(self):
+        return len(self.docs)
diff --git a/python/paddle/incubate/hapi/datasets/imikolov.py b/python/paddle/incubate/hapi/datasets/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e6ad43b506265ee8c9c8617a87eba5a041632bd
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/imikolov.py
@@ -0,0 +1,171 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import six
+import tarfile
+import numpy as np
+import collections
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ['Imikolov']
+
+URL = 'https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class Imikolov(Dataset):
+    """
+    Implementation of imikolov dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
+        window_size(int): sliding window size for 'NGRAM' data. Default -1.
+        mode(str): 'train' 'test' mode. Default 'train'.
+        min_word_freq(int): minimal word frequence for building word dictionary. Default 50.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of imikolov dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Imikolov
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, src, trg):
+		    return paddle.sum(src), paddle.sum(trg)
+
+	    paddle.disable_static()
+
+	    imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)
+
+	    for i in range(10):
+		src, trg = imikolov[i]
+		src = paddle.to_tensor(src)
+		trg = paddle.to_tensor(trg)
+
+		model = SimpleNet()
+		src, trg = model(src, trg)
+		print(src.numpy().shape, trg.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 data_type='NGRAM',
+                 window_size=-1,
+                 mode='train',
+                 min_word_freq=50,
+                 download=True):
+        assert data_type.upper() in ['NGRAM', 'SEQ'], \
+            "data type should be 'NGRAM', 'SEQ', but got {}".format(data_type)
+        self.data_type = data_type.upper()
+
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.window_size = window_size
+        self.min_word_freq = min_word_freq
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'imikolov', download)
+
+        # Build a word dictionary from the corpus
+        self.word_idx = self._build_work_dict(min_word_freq)
+
+        # read dataset into memory
+        self._load_anno()
+
+    def word_count(self, f, word_freq=None):
+        if word_freq is None:
+            word_freq = collections.defaultdict(int)
+
+        for l in f:
+            for w in l.strip().split():
+                word_freq[w] += 1
+            word_freq['<s>'] += 1
+            word_freq['<e>'] += 1
+
+        return word_freq
+
+    def _build_work_dict(self, cutoff):
+        train_filename = './simple-examples/data/ptb.train.txt'
+        test_filename = './simple-examples/data/ptb.valid.txt'
+        with tarfile.open(self.data_file) as tf:
+            trainf = tf.extractfile(train_filename)
+            testf = tf.extractfile(test_filename)
+            word_freq = self.word_count(testf, self.word_count(trainf))
+            if '<unk>' in word_freq:
+                # remove <unk> for now, since we will set it as last index
+                del word_freq['<unk>']
+
+            word_freq = [
+                x for x in six.iteritems(word_freq) if x[1] > self.min_word_freq
+            ]
+
+            word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+            words, _ = list(zip(*word_freq_sorted))
+            word_idx = dict(list(zip(words, six.moves.range(len(words)))))
+            word_idx['<unk>'] = len(words)
+
+        return word_idx
+
+    def _load_anno(self):
+        self.data = []
+        with tarfile.open(self.data_file) as tf:
+            filename = './simple-examples/data/ptb.{}.txt'.format(self.mode)
+            f = tf.extractfile(filename)
+
+            UNK = self.word_idx['<unk>']
+            for l in f:
+                if self.data_type == 'NGRAM':
+                    assert self.window_size > -1, 'Invalid gram length'
+                    l = ['<s>'] + l.strip().split() + ['<e>']
+                    if len(l) >= self.window_size:
+                        l = [self.word_idx.get(w, UNK) for w in l]
+                        for i in six.moves.range(self.window_size, len(l) + 1):
+                            self.data.append(tuple(l[i - self.window_size:i]))
+                elif self.data_type == 'SEQ':
+                    l = l.strip().split()
+                    l = [self.word_idx.get(w, UNK) for w in l]
+                    src_seq = [self.word_idx['<s>']] + l
+                    trg_seq = l + [self.word_idx['<e>']]
+                    if self.window_size > 0 and len(src_seq) > self.window_size:
+                        continue
+                    self.data.append((src_seq, trg_seq))
+                else:
+                    assert False, 'Unknow data type'
+
+    def __getitem__(self, idx):
+        return tuple([np.array(d) for d in self.data[idx]])
+
+    def __len__(self):
+        return len(self.data)
diff --git a/python/paddle/incubate/hapi/datasets/mnist.py b/python/paddle/incubate/hapi/datasets/mnist.py
index bd48ca1c9668b40ac0379bfeda11a5c056f9fd44..ed046e5a1d9bbcc33f3148c6ecde8a349e478cb0 100644
--- a/python/paddle/incubate/hapi/datasets/mnist.py
+++ b/python/paddle/incubate/hapi/datasets/mnist.py
@@ -38,7 +38,7 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
 
 class MNIST(Dataset):
     """
-    Implement of MNIST dataset
+    Implementation of `MNIST <http://yann.lecun.com/exdb/mnist/>`_ dataset
 
     Args:
         image_path(str): path to image file, can be set None if
@@ -48,9 +48,8 @@ class MNIST(Dataset):
         chw_format(bool): If set True, the output shape is [1, 28, 28],
             otherwise, output shape is [1, 784]. Default True.
         mode(str): 'train' or 'test' mode. Default 'train'.
-        download(bool): whether auto download mnist dataset if
-            :attr:`image_path`/:attr:`label_path` unset. Default
-            True
+        download(bool): whether to download dataset automatically if
+            :attr:`image_path` :attr:`label_path` is not set. Default True
 
     Returns:
         Dataset: MNIST Dataset.
@@ -82,7 +81,7 @@ class MNIST(Dataset):
         self.chw_format = chw_format
         self.image_path = image_path
         if self.image_path is None:
-            assert download, "image_path not set and auto download disabled"
+            assert download, "image_path is not set and downloading automatically is disabled"
             image_url = TRAIN_IMAGE_URL if mode == 'train' else TEST_IMAGE_URL
             image_md5 = TRAIN_IMAGE_MD5 if mode == 'train' else TEST_IMAGE_MD5
             self.image_path = _check_exists_and_download(
@@ -90,9 +89,9 @@ class MNIST(Dataset):
 
         self.label_path = label_path
         if self.label_path is None:
-            assert download, "label_path not set and auto download disabled"
-            label_url = TRAIN_LABEL_URL if mode == 'train' else TEST_LABEL_URL
-            label_md5 = TRAIN_LABEL_MD5 if mode == 'train' else TEST_LABEL_MD5
+            assert download, "label_path is not set and downloading automatically is disabled"
+            label_url = TRAIN_LABEL_URL if self.mode == 'train' else TEST_LABEL_URL
+            label_md5 = TRAIN_LABEL_MD5 if self.mode == 'train' else TEST_LABEL_MD5
             self.label_path = _check_exists_and_download(
                 label_path, label_url, label_md5, 'mnist', download)
 
diff --git a/python/paddle/incubate/hapi/datasets/movie_reviews.py b/python/paddle/incubate/hapi/datasets/movie_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bf0684ebcd315807b9dc736c5481383073e5ba8
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/movie_reviews.py
@@ -0,0 +1,173 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import numpy as np
+import collections
+import nltk
+from nltk.corpus import movie_reviews
+import zipfile
+from functools import cmp_to_key
+from itertools import chain
+
+import paddle
+from paddle.io import Dataset
+
+__all__ = ['MovieReviews']
+
+URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
+MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
+
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+class MovieReviews(Dataset):
+    """
+    Implementation of `NLTK movie reviews <http://www.nltk.org/nltk_data/>`_ dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' 'test' mode. Default 'train'.
+        download(bool): whether auto download cifar dataset if
+            :attr:`data_file` unset. Default True.
+
+    Returns:
+        Dataset: instance of movie reviews dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import MovieReviews
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, word, category):
+		    return paddle.sum(word), category
+
+	    paddle.disable_static()
+
+	    movie_reviews = MovieReviews(mode='train')
+
+	    for i in range(10):
+		word_list, category = movie_reviews[i]
+		word_list = paddle.to_tensor(word_list)
+		category = paddle.to_tensor(category)
+
+		model = SimpleNet()
+		word_list, category = model(word_list, category)
+		print(word_list.numpy().shape, category.numpy())
+
+    """
+
+    def __init__(self, mode='train'):
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self._download_data_if_not_yet()
+
+        # read dataset into memory
+        self._load_sentiment_data()
+
+    def _get_word_dict(self):
+        """
+	Sorted the words by the frequency of words which occur in sample
+	:return:
+	    words_freq_sorted
+	"""
+        words_freq_sorted = list()
+        word_freq_dict = collections.defaultdict(int)
+
+        for category in movie_reviews.categories():
+            for field in movie_reviews.fileids(category):
+                for words in movie_reviews.words(field):
+                    word_freq_dict[words] += 1
+        words_sort_list = list(six.iteritems(word_freq_dict))
+        words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1]))
+        for index, word in enumerate(words_sort_list):
+            words_freq_sorted.append((word[0], index))
+        return words_freq_sorted
+
+    def _sort_files(self):
+        """
+	Sorted the sample for cross reading the sample
+	:return:
+	    files_list
+	"""
+        files_list = list()
+        neg_file_list = movie_reviews.fileids('neg')
+        pos_file_list = movie_reviews.fileids('pos')
+        files_list = list(
+            chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
+        return files_list
+
+    def _load_sentiment_data(self):
+        """
+	Load the data set
+	:return:
+	    data_set
+	"""
+        self.data = []
+        words_ids = dict(self._get_word_dict())
+        for sample_file in self._sort_files():
+            words_list = list()
+            category = 0 if 'neg' in sample_file else 1
+            for word in movie_reviews.words(sample_file):
+                words_list.append(words_ids[word.lower()])
+            self.data.append((words_list, category))
+
+    def _download_data_if_not_yet(self):
+        """
+	Download the data set, if the data set is not download.
+	"""
+        try:
+            # download and extract movie_reviews.zip
+            paddle.dataset.common.download(
+                URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
+            path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
+            filename = os.path.join(path, 'movie_reviews.zip')
+            zip_file = zipfile.ZipFile(filename)
+            zip_file.extractall(path)
+            zip_file.close()
+            # make sure that nltk can find the data
+            if paddle.dataset.common.DATA_HOME not in nltk.data.path:
+                nltk.data.path.append(paddle.dataset.common.DATA_HOME)
+            movie_reviews.categories()
+        except LookupError:
+            print("Downloading movie_reviews data set, please wait.....")
+            nltk.download(
+                'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
+            print("Download data set success.....")
+            print("Path is " + nltk.data.find('corpora/movie_reviews').path)
+
+    def __getitem__(self, idx):
+        if self.mode == 'test':
+            idx += NUM_TRAINING_INSTANCES
+        data = self.data[idx]
+        return np.array(data[0]), np.array(data[1])
+
+    def __len__(self):
+        if self.mode == 'train':
+            return NUM_TRAINING_INSTANCES
+        else:
+            return NUM_TOTAL_INSTANCES - NUM_TRAINING_INSTANCES
diff --git a/python/paddle/incubate/hapi/datasets/movielens.py b/python/paddle/incubate/hapi/datasets/movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..228e9dc6d477cf539683963dc6ddaa3c02c8fe95
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/movielens.py
@@ -0,0 +1,219 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import zipfile
+import re
+import random
+import functools
+import six
+
+import paddle
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['Movielens']
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
+
+URL = 'https://dataset.bj.bcebos.com/movielens%2Fml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
+
+class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
+
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self, categories_dict, movie_title_dict):
+        """
+        Get information from a movie.
+        """
+        return [[self.index], [categories_dict[c] for c in self.categories],
+                [movie_title_dict[w.lower()] for w in self.title.split()]]
+
+    def __str__(self):
+        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
+            self.index, self.title, self.categories)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
+
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = age_table.index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        """
+        Get information from a user.
+        """
+        return [[self.index], [0 if self.is_male else 1], [self.age],
+                [self.job_id]]
+
+    def __str__(self):
+        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
+            self.index, "M"
+            if self.is_male else "F", age_table[self.age], self.job_id)
+
+    def __repr__(self):
+        return str(self)
+
+
+class Movielens(Dataset):
+    """
+    Implementation of `Movielens 1-M <https://grouplens.org/datasets/movielens/1m/>`_ dataset.
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' or 'test' mode. Default 'train'.
+        test_ratio(float): split ratio for test sample. Default 0.1.
+        rand_seed(int): random seed. Default 0.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of Movielens 1-M dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import Movielens
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, category, title, rating):
+		    return paddle.sum(category), paddle.sum(title), paddle.sum(rating)
+
+	    paddle.disable_static()
+
+	    movielens = Movielens(mode='train')
+
+	    for i in range(10):
+		category, title, rating = movielens[i][-3:]
+		category = paddle.to_tensor(category)
+		title = paddle.to_tensor(title)
+		rating = paddle.to_tensor(rating)
+
+		model = SimpleNet()
+		category, title, rating = model(category, title, rating)
+		print(category.numpy().shape, title.numpy().shape, rating.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 test_ratio=0.1,
+                 rand_seed=0,
+                 download=True):
+        assert mode.lower() in ['train', 'test'], \
+            "mode should be 'train', 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'sentiment', download)
+
+        self.test_ratio = test_ratio
+        self.rand_seed = rand_seed
+
+        np.random.seed(rand_seed)
+        self._load_meta_info()
+        self._load_data()
+
+    def _load_meta_info(self):
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        self.movie_info = dict()
+        self.movie_title_dict = dict()
+        self.categories_dict = dict()
+        self.user_info = dict()
+        with zipfile.ZipFile(self.data_file) as package:
+            for info in package.infolist():
+                assert isinstance(info, zipfile.ZipInfo)
+                title_word_set = set()
+                categories_set = set()
+                with package.open('ml-1m/movies.dat') as movie_file:
+                    for i, line in enumerate(movie_file):
+                        line = cpt.to_text(line, encoding='latin')
+                        movie_id, title, categories = line.strip().split('::')
+                        categories = categories.split('|')
+                        for c in categories:
+                            categories_set.add(c)
+                        title = pattern.match(title).group(1)
+                        self.movie_info[int(movie_id)] = MovieInfo(
+                            index=movie_id, categories=categories, title=title)
+                        for w in title.split():
+                            title_word_set.add(w.lower())
+
+                for i, w in enumerate(title_word_set):
+                    self.movie_title_dict[w] = i
+
+                for i, c in enumerate(categories_set):
+                    self.categories_dict[c] = i
+
+                with package.open('ml-1m/users.dat') as user_file:
+                    for line in user_file:
+                        line = cpt.to_text(line, encoding='latin')
+                        uid, gender, age, job, _ = line.strip().split("::")
+                        self.user_info[int(uid)] = UserInfo(
+                            index=uid, gender=gender, age=age, job_id=job)
+
+    def _load_data(self):
+        self.data = []
+        is_test = self.mode == 'test'
+        with zipfile.ZipFile(self.data_file) as package:
+            with package.open('ml-1m/ratings.dat') as rating:
+                for line in rating:
+                    line = cpt.to_text(line, encoding='latin')
+                    if (np.random.random() < self.test_ratio) == is_test:
+                        uid, mov_id, rating, _ = line.strip().split("::")
+                        uid = int(uid)
+                        mov_id = int(mov_id)
+                        rating = float(rating) * 2 - 5.0
+
+                        mov = self.movie_info[mov_id]
+                        usr = self.user_info[uid]
+                        self.data.append(usr.value() + \
+                                         mov.value(self.categories_dict, self.movie_title_dict) + \
+                                         [[rating]])
+
+    def __getitem__(self, idx):
+        data = self.data[idx]
+        return tuple([np.array(d) for d in data])
+
+    def __len__(self):
+        return len(self.data)
diff --git a/python/paddle/incubate/hapi/datasets/uci_housing.py b/python/paddle/incubate/hapi/datasets/uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f2c4a5bb5d9d60ba1316e3e2a5f174df94fe99
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/uci_housing.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import six
+import numpy as np
+
+import paddle.dataset.common
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ["UCIHousing"]
+
+URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT'
+]
+
+
+class UCIHousing(Dataset):
+    """
+    Implementation of `UCI housing <https://archive.ics.uci.edu/ml/datasets/Housing>`_
+    dataset
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train' or 'test' mode. Default 'train'.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of UCI housing dataset.
+
+    Examples:
+        
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import UCIHousing
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, feature, target):
+		    return paddle.sum(feature), target
+
+	    paddle.disable_static()
+
+	    uci_housing = UCIHousing(mode='train')
+
+	    for i in range(10):
+		feature, target = uci_housing[i]
+		feature = paddle.to_tensor(feature)
+		target = paddle.to_tensor(target)
+
+		model = SimpleNet()
+		feature, target = model(feature, target)
+		print(feature.numpy().shape, target.numpy())
+
+    """
+
+    def __init__(self, data_file=None, mode='train', download=True):
+        assert mode.lower() in ['train', 'test'], \
+                "mode should be 'train' or 'test', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(data_file, URL, MD5,
+                                                        'uci_housing', download)
+
+        # read dataset into memory
+        self._load_data()
+
+    def _load_data(self, feature_num=14, ratio=0.8):
+        data = np.fromfile(self.data_file, sep=' ')
+        data = data.reshape(data.shape[0] // feature_num, feature_num)
+        maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+            axis=0) / data.shape[0]
+        for i in six.moves.range(feature_num - 1):
+            data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
+        offset = int(data.shape[0] * ratio)
+        if self.mode == 'train':
+            self.data = data[:offset]
+        elif self.mode == 'test':
+            self.data = data[offset:]
+
+    def __getitem__(self, idx):
+        data = self.data[idx]
+        return np.array(data[:-1]), np.array(data[-1:])
+
+    def __len__(self):
+        return len(self.data)
diff --git a/python/paddle/incubate/hapi/datasets/voc2012.py b/python/paddle/incubate/hapi/datasets/voc2012.py
new file mode 100644
index 0000000000000000000000000000000000000000..1811c455db530710a0559c077975ab08d6a94ac3
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/voc2012.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import io
+import tarfile
+import numpy as np
+from PIL import Image
+
+from paddle.io import Dataset
+from .utils import _check_exists_and_download
+
+__all__ = ["VOC2012"]
+
+VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
+VOCtrainval_11-May-2012.tar'
+
+VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
+SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
+DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
+LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
+
+CACHE_DIR = 'voc2012'
+
+MODE_FLAG_MAP = {'train': 'trainval', 'test': 'train', 'valid': "val"}
+
+
+class VOC2012(Dataset):
+    """
+    Implementation of `VOC2012 <http://host.robots.ox.ac.uk/pascal/VOC/voc2012/>`_ dataset
+
+    Args:
+        data_file(str): path to data file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import VOC2012
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, image, label):
+		    return paddle.sum(image), label
+
+	    paddle.disable_static()
+
+	    voc2012 = VOC2012(mode='train')
+
+	    for i in range(10):
+		image, label= voc2012[i]
+		image = paddle.cast(paddle.to_tensor(image), 'float32')
+		label = paddle.to_tensor(label)
+
+		model = SimpleNet()
+		image, label= model(image, label)
+		print(image.numpy().shape, label.numpy().shape)
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 transform=None,
+                 download=True):
+        assert mode.lower() in ['train', 'valid', 'test'], \
+            "mode should be 'train', 'valid' or 'test', but got {}".format(mode)
+        self.flag = MODE_FLAG_MAP[mode.lower()]
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, VOC_URL, VOC_MD5, CACHE_DIR, download)
+        self.transform = transform
+
+        # read dataset into memory
+        self._load_anno()
+
+    def _load_anno(self):
+        self.name2mem = {}
+        self.data_tar = tarfile.open(self.data_file)
+        for ele in self.data_tar.getmembers():
+            self.name2mem[ele.name] = ele
+
+        set_file = SET_FILE.format(self.flag)
+        sets = self.data_tar.extractfile(self.name2mem[set_file])
+
+        self.data = []
+        self.labels = []
+
+        for line in sets:
+            line = line.strip()
+            data = DATA_FILE.format(line.decode('utf-8'))
+            label = LABEL_FILE.format(line.decode('utf-8'))
+            self.data.append(data)
+            self.labels.append(label)
+
+    def __getitem__(self, idx):
+        data_file = self.data[idx]
+        label_file = self.labels[idx]
+
+        data = self.data_tar.extractfile(self.name2mem[data_file]).read()
+        label = self.data_tar.extractfile(self.name2mem[label_file]).read()
+        data = Image.open(io.BytesIO(data))
+        label = Image.open(io.BytesIO(label))
+        data = np.array(data)
+        label = np.array(label)
+        if self.transform is not None:
+            data = self.transform(data)
+        return data, label
+
+    def __len__(self):
+        return len(self.data)
+
+    def __del__(self):
+        if self.data_tar:
+            self.data_tar.close()
diff --git a/python/paddle/incubate/hapi/datasets/wmt14.py b/python/paddle/incubate/hapi/datasets/wmt14.py
new file mode 100644
index 0000000000000000000000000000000000000000..b495ea931a80425b8e24b81cdf8fdfd2c0920a3e
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/wmt14.py
@@ -0,0 +1,179 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import tarfile
+import numpy as np
+import gzip
+
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['WMT14']
+
+URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
+                'cslm_joint_paper/data/dev+test.tgz')
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# this is a small set of data for test. The original data is too large and
+# will be add later.
+URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz')
+MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+class WMT14(Dataset):
+    """
+    Implementation of `WMT14 <http://www.statmt.org/wmt14/>`_ test dataset.
+    The original WMT14 dataset is too large and a small set of data for set is
+    provided. This module will download dataset from
+    http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' or 'gen'. Default 'train'
+        dict_size(int): word dictionary size. Default -1.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of WMT14 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import WMT14
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, src_ids, trg_ids, trg_ids_next):
+		    return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)
+
+	    paddle.disable_static()
+
+	    wmt14 = WMT14(mode='train', dict_size=50)
+
+	    for i in range(10):
+		src_ids, trg_ids, trg_ids_next = wmt14[i]
+		src_ids = paddle.to_tensor(src_ids)
+		trg_ids = paddle.to_tensor(trg_ids)
+		trg_ids_next = paddle.to_tensor(trg_ids_next)
+
+		model = SimpleNet()
+		src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
+		print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy())
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 dict_size=-1,
+                 download=True):
+        assert mode.lower() in ['train', 'test', 'gen'], \
+            "mode should be 'train', 'test' or 'gen', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, URL_TRAIN, MD5_TRAIN, 'wmt14', download)
+
+        # read dataset into memory
+        assert dict_size > 0, "dict_size should be set as positive number"
+        self.dict_size = dict_size
+        self._load_data()
+
+    def _load_data(self):
+        def __to_dict(fd, size):
+            out_dict = dict()
+            for line_count, line in enumerate(fd):
+                if line_count < size:
+                    out_dict[cpt.to_text(line.strip())] = line_count
+                else:
+                    break
+            return out_dict
+
+        self.src_ids = []
+        self.trg_ids = []
+        self.trg_ids_next = []
+        with tarfile.open(self.data_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith("src.dict")
+            ]
+            assert len(names) == 1
+            self.src_dict = __to_dict(f.extractfile(names[0]), self.dict_size)
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith("trg.dict")
+            ]
+            assert len(names) == 1
+            self.trg_dict = __to_dict(f.extractfile(names[0]), self.dict_size)
+
+            file_name = "{}/{}".format(self.mode, self.mode)
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line = cpt.to_text(line)
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        self.src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [self.trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # remove sequence whose length > 80 in training mode
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [self.trg_dict[END]]
+                    trg_ids = [self.trg_dict[START]] + trg_ids
+
+                    self.src_ids.append(src_ids)
+                    self.trg_ids.append(trg_ids)
+                    self.trg_ids_next.append(trg_ids_next)
+
+    def __getitem__(self, idx):
+        return (np.array(self.src_ids[idx]), np.array(self.trg_ids[idx]),
+                np.array(self.trg_ids_next[idx]))
+
+    def __len__(self):
+        return len(self.src_ids)
+
+    def get_dict(self, reverse=False):
+        if reverse:
+            src_dict = {v: k for k, v in six.iteritems(src_dict)}
+            trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
+        return src_dict, trg_dict
diff --git a/python/paddle/incubate/hapi/datasets/wmt16.py b/python/paddle/incubate/hapi/datasets/wmt16.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3cb8bfacadd15f6c0f973a09dbf544bbc396c0
--- /dev/null
+++ b/python/paddle/incubate/hapi/datasets/wmt16.py
@@ -0,0 +1,247 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+"""
+
+from __future__ import print_function
+
+import os
+import six
+import tarfile
+import numpy as np
+from collections import defaultdict
+
+import paddle
+from paddle.io import Dataset
+import paddle.compat as cpt
+from .utils import _check_exists_and_download
+
+__all__ = ['WMT16']
+
+DATA_URL = ("http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+class WMT16(Dataset):
+    """
+    Implementation of `WMT16 <http://www.statmt.org/wmt16/>`_ test dataset.
+    ACL2016 Multimodal Machine Translation. Please see this website for more
+    details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+
+    If you use the dataset created for your task, please cite the following paper:
+    Multi30K: Multilingual English-German Image Descriptions.
+
+    .. code-block:: text
+
+        @article{elliott-EtAl:2016:VL16,
+         author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
+         title     = {Multi30K: Multilingual English-German Image Descriptions},
+         booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+         year      = {2016},
+         pages     = {70--74},
+         year      = 2016
+        }
+
+    Args:
+        data_file(str): path to data tar file, can be set None if
+            :attr:`download` is True. Default None
+        mode(str): 'train', 'test' or 'val'. Default 'train'
+        src_dict_size(int): word dictionary size for source language word. Default -1.
+        trg_dict_size(int): word dictionary size for target language word. Default -1.
+        lang(str): source language, 'en' or 'de'. Default 'en'.
+        download(bool): whether to download dataset automatically if
+            :attr:`data_file` is not set. Default True
+
+    Returns:
+        Dataset: instance of WMT16 dataset
+
+    Examples:
+
+        .. code-block:: python
+
+	    import paddle
+	    from paddle.incubate.hapi.datasets import WMT16
+
+	    class SimpleNet(paddle.nn.Layer):
+		def __init__(self):
+		    super(SimpleNet, self).__init__()
+
+		def forward(self, src_ids, trg_ids, trg_ids_next):
+		    return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)
+
+	    paddle.disable_static()
+
+	    wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50)
+
+	    for i in range(10):
+		src_ids, trg_ids, trg_ids_next = wmt16[i]
+		src_ids = paddle.to_tensor(src_ids)
+		trg_ids = paddle.to_tensor(trg_ids)
+		trg_ids_next = paddle.to_tensor(trg_ids_next)
+
+		model = SimpleNet()
+		src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
+		print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy())
+
+    """
+
+    def __init__(self,
+                 data_file=None,
+                 mode='train',
+                 src_dict_size=-1,
+                 trg_dict_size=-1,
+                 lang='en',
+                 download=True):
+        assert mode.lower() in ['train', 'test', 'val'], \
+            "mode should be 'train', 'test' or 'val', but got {}".format(mode)
+        self.mode = mode.lower()
+
+        self.data_file = data_file
+        if self.data_file is None:
+            assert download, "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, DATA_URL, DATA_MD5, 'wmt16', download)
+
+        self.lang = lang
+        assert src_dict_size > 0, "dict_size should be set as positive number"
+        assert trg_dict_size > 0, "dict_size should be set as positive number"
+        self.src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if lang == "en"
+                                                 else TOTAL_DE_WORDS))
+        self.trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if lang == "en"
+                                                 else TOTAL_EN_WORDS))
+
+        # load source and target word dict
+        self.src_dict = self._load_dict(lang, src_dict_size)
+        self.trg_dict = self._load_dict("de" if lang == "en" else "en",
+                                        trg_dict_size)
+
+        # load data
+        self.data = self._load_data()
+
+    def _load_dict(self, lang, dict_size, reverse=False):
+        dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                                 "wmt16/%s_%d.dict" % (lang, dict_size))
+        dict_found = False
+        if os.path.exists(dict_path):
+            with open(dict_path, "rb") as d:
+                dict_found = len(d.readlines()) == dict_size
+        if not dict_found:
+            self._build_dict(dict_path, dict_size, lang)
+
+        word_dict = {}
+        with open(dict_path, "rb") as fdict:
+            for idx, line in enumerate(fdict):
+                if reverse:
+                    word_dict[idx] = cpt.to_text(line.strip())
+                else:
+                    word_dict[cpt.to_text(line.strip())] = idx
+        return word_dict
+
+    def _build_dict(self, dict_path, dict_size, lang):
+        word_dict = defaultdict(int)
+        with tarfile.open(self.data_file, mode="r") as f:
+            for line in f.extractfile("wmt16/train"):
+                line = cpt.to_text(line)
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2: continue
+                sen = line_split[0] if self.lang == "en" else line_split[1]
+                for w in sen.split():
+                    word_dict[w] += 1
+
+        with open(dict_path, "wb") as fout:
+            fout.write(
+                cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
+            for idx, word in enumerate(
+                    sorted(
+                        six.iteritems(word_dict),
+                        key=lambda x: x[1],
+                        reverse=True)):
+                if idx + 3 == dict_size: break
+                fout.write(cpt.to_bytes(word[0]))
+                fout.write(cpt.to_bytes('\n'))
+
+    def _load_data(self):
+        # the index for start mark, end mark, and unk are the same in source
+        # language and target language. Here uses the source language
+        # dictionary to determine their indices.
+        start_id = self.src_dict[START_MARK]
+        end_id = self.src_dict[END_MARK]
+        unk_id = self.src_dict[UNK_MARK]
+
+        src_col = 0 if self.lang == "en" else 1
+        trg_col = 1 - src_col
+
+        self.src_ids = []
+        self.trg_ids = []
+        self.trg_ids_next = []
+        with tarfile.open(self.data_file, mode="r") as f:
+            for line in f.extractfile("wmt16/{}".format(self.mode)):
+                line = cpt.to_text(line)
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    self.src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [self.trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                self.src_ids.append(src_ids)
+                self.trg_ids.append(trg_ids)
+                self.trg_ids_next.append(trg_ids_next)
+
+    def __getitem__(self, idx):
+        return (np.array(self.src_ids[idx]), np.array(self.trg_ids[idx]),
+                np.array(self.trg_ids_next[idx]))
+
+    def __len__(self):
+        return len(self.src_ids)
+
+    def get_dict(self, lang, reverse=False):
+        """
+	return the word dictionary for the specified language.
+
+	Args:
+	    lang(string): A string indicating which language is the source
+			  language. Available options are: "en" for English
+			  and "de" for Germany.
+	    reverse(bool): If reverse is set to False, the returned python
+			   dictionary will use word as key and use index as value.
+			   If reverse is set to True, the returned python
+			   dictionary will use index as key and word as value.
+
+	Returns:
+	    dict: The word dictionary for the specific language.
+	"""
+
+        dict_size = self.src_dict_size if lang == self.lang else self.trg_dict_size
+
+        dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                                 "wmt16/%s_%d.dict" % (lang, dict_size))
+        assert os.path.exists(dict_path), "Word dictionary does not exist. "
+        "Please invoke paddle.dataset.wmt16.train/test/validation first "
+        "to build the dictionary."
+        return _load_dict(lang, dict_size)
diff --git a/python/paddle/incubate/hapi/distributed.py b/python/paddle/incubate/hapi/distributed.py
index 585f466ea6a1ef5a3d888b7c46fe2908ffd2c769..0e38dc8edc758e9c1b8a96add1df242fb0aecef1 100644
--- a/python/paddle/incubate/hapi/distributed.py
+++ b/python/paddle/incubate/hapi/distributed.py
@@ -49,6 +49,13 @@ class DistributedBatchSampler(BatchSampler):
                      `__len__` for BatchSampler to get sample
                      number of data source.
         batch_size(int): sample indice number in a mini-batch indices.
+        num_replicas(int, optional): porcess number in distributed training.
+            If :attr:`num_replicas` is None, :attr:`num_replicas` will be
+            retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`.
+            Default None.
+        rank(int, optional): the rank of the current process among :attr:`num_replicas`
+            processes. If :attr:`rank` is None, :attr:`rank` is retrieved from
+            :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None.
         shuffle(bool): whther to shuffle indices order before genrating
             batch indices. Default False.
         drop_last(bool): whether drop the last incomplete batch dataset size
@@ -84,7 +91,13 @@ class DistributedBatchSampler(BatchSampler):
                 break
     """
 
-    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=False,
+                 drop_last=False):
         self.dataset = dataset
 
         assert isinstance(batch_size, int) and batch_size > 0, \
@@ -96,9 +109,21 @@ class DistributedBatchSampler(BatchSampler):
         assert isinstance(drop_last, bool), \
                 "drop_last should be a boolean number"
 
+        if num_replicas is not None:
+            assert isinstance(num_replicas, int) and num_replicas > 0, \
+                    "num_replicas should be a positive integer"
+            self.nranks = num_replicas
+        else:
+            self.nranks = ParallelEnv().nranks
+
+        if rank is not None:
+            assert isinstance(rank, int) and rank >= 0, \
+                    "rank should be a non-negative integer"
+            self.local_rank = rank
+        else:
+            self.local_rank = ParallelEnv().local_rank
+
         self.drop_last = drop_last
-        self.nranks = ParallelEnv().nranks
-        self.local_rank = ParallelEnv().local_rank
         self.epoch = 0
         self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
         self.total_size = self.num_samples * self.nranks
diff --git a/python/paddle/incubate/hapi/metrics.py b/python/paddle/incubate/hapi/metrics.py
deleted file mode 100644
index 9e9a2e78524022d7de8ca80a7fb8e3c478dacd36..0000000000000000000000000000000000000000
--- a/python/paddle/incubate/hapi/metrics.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-import abc
-import numpy as np
-import paddle.fluid as fluid
-
-import logging
-
-FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-logger = logging.getLogger(__name__)
-
-__all__ = ['Metric', 'Accuracy']
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Metric(object):
-    """
-    Base class for metric, encapsulates metric logic and APIs
-    Usage:
-        
-        m = SomeMetric()
-        for prediction, label in ...:
-            m.update(prediction, label)
-        m.accumulate()
-        
-    Advanced usage for :code:`add_metric_op`
-    Metric calculation can be accelerated by calculating metric states
-    from model outputs and labels by Paddle OPs in :code:`add_metric_op`,
-    metric states will be fetch as numpy array and call :code:`update`
-    with states in numpy format.
-    Metric calculated as follows (operations in Model and Metric are
-    indicated with curly brackets, while data nodes not):
-                 inputs & labels              || ------------------
-                       |                      ||
-                    {model}                   ||
-                       |                      ||
-                outputs & labels              ||
-                       |                      ||    tensor data
-             {Metric.add_metric_op}           ||
-                       |                      ||
-              metric states(tensor)           ||
-                       |                      ||
-                {fetch as numpy}              || ------------------
-                       |                      ||
-              metric states(numpy)            ||    numpy data
-                       |                      ||
-                {Metric.update}               \/ ------------------
-    Examples:
-        
-        For :code:`Accuracy` metric, which takes :code:`pred` and :code:`label`
-        as inputs, we can calculate the correct prediction matrix between
-        :code:`pred` and :code:`label` in :code:`add_metric_op`.
-        For examples, prediction results contains 10 classes, while :code:`pred`
-        shape is [N, 10], :code:`label` shape is [N, 1], N is mini-batch size,
-        and we only need to calculate accurary of top-1 and top-5, we could
-        calculated the correct prediction matrix of the top-5 scores of the
-        prediction of each sample like follows, while the correct prediction
-        matrix shape is [N, 5].
-        .. code-block:: python
-            def add_metric_op(pred, label):
-                # sort prediction and slice the top-5 scores
-                pred = fluid.layers.argsort(pred, descending=True)[1][:, :5]
-                # calculate whether the predictions are correct
-                correct = pred == label
-                return fluid.layers.cast(correct, dtype='float32')
-        With the :code:`add_metric_op`, we split some calculations to OPs(which
-        may run on GPU devices, will be faster), and only fetch 1 tensor with
-        shape as [N, 5] instead of 2 tensors with shapes as [N, 10] and [N, 1].
-        :code:`update` can be define as follows:
-        .. code-block:: python
-            def update(self, correct):
-                accs = []
-                for i, k in enumerate(self.topk):
-                    num_corrects = correct[:, :k].sum()
-                    num_samples = len(correct)
-                    accs.append(float(num_corrects) / num_samples)
-                    self.total[i] += num_corrects
-                    self.count[i] += num_samples
-                return accs
-    """
-
-    def __init__(self):
-        pass
-
-    @abc.abstractmethod
-    def reset(self):
-        """
-        Reset states and result
-        """
-        raise NotImplementedError("function 'reset' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def update(self, *args):
-        """
-        Update states for metric
-
-        Inputs of :code:`update` is the outputs of :code:`Metric.add_metric_op`,
-        if :code:`add_metric_op` is not defined, the inputs of :code:`update`
-        will be flatten arguments of **output** of mode and **label** from data:
-        :code:`update(output1, output2, ..., label1, label2,...)`
-
-        see :code:`Metric.add_metric_op`
-        """
-        raise NotImplementedError("function 'update' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    @abc.abstractmethod
-    def accumulate(self):
-        """
-        Accumulates statistics, computes and returns the metric value
-        """
-        raise NotImplementedError(
-            "function 'accumulate' not implemented in {}.".format(
-                self.__class__.__name__))
-
-    @abc.abstractmethod
-    def name(self):
-        """
-        Returns metric name
-        """
-        raise NotImplementedError("function 'name' not implemented in {}.".
-                                  format(self.__class__.__name__))
-
-    def add_metric_op(self, *args):
-        """
-        This API is advanced usage to accelerate metric calculating, calulations
-        from outputs of model to the states which should be updated by Metric can
-        be defined here, where Paddle OPs is also supported. Outputs of this API
-        will be the inputs of "Metric.update".
-
-        If :code:`add_metric_op` is defined, it will be called with **outputs**
-        of model and **labels** from data as arguments, all outputs and labels
-        will be concatenated and flatten and each filed as a separate argument
-        as follows:
-        :code:`add_metric_op(output1, output2, ..., label1, label2,...)`
-
-        If :code:`add_metric_op` is not defined, default behaviour is to pass
-        input to output, so output format will be:
-        :code:`return output1, output2, ..., label1, label2,...`
-
-        see :code:`Metric.update`
-        """
-        return args
-
-
-class Accuracy(Metric):
-    """
-    Encapsulates accuracy metric logic
-
-    Examples:
-        
-        .. code-block:: python
-
-        import paddle
-        import paddle.fluid as fluid
-        import paddle.incubate.hapi as hapi
-
-        fluid.enable_dygraph()
-
-        train_dataset = hapi.datasets.MNIST(mode='train')
-
-        model = hapi.Model(hapi.vision.LeNet(classifier_activation=None))
-        optim = fluid.optimizer.Adam(
-            learning_rate=0.001, parameter_list=model.parameters())
-        model.prepare(
-            optim,
-            loss_function=paddle.nn.CrossEntropyLoss(),
-            metrics=hapi.metrics.Accuracy())
-
-        model.fit(train_dataset, batch_size=64)
-
-    """
-
-    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
-        super(Accuracy, self).__init__(*args, **kwargs)
-        self.topk = topk
-        self.maxk = max(topk)
-        self._init_name(name)
-        self.reset()
-
-    def add_metric_op(self, pred, label, *args):
-        pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk]
-        correct = pred == label
-        return fluid.layers.cast(correct, dtype='float32')
-
-    def update(self, correct, *args):
-        accs = []
-        for i, k in enumerate(self.topk):
-            num_corrects = correct[:, :k].sum()
-            num_samples = len(correct)
-            accs.append(float(num_corrects) / num_samples)
-            self.total[i] += num_corrects
-            self.count[i] += num_samples
-        return accs
-
-    def reset(self):
-        self.total = [0.] * len(self.topk)
-        self.count = [0] * len(self.topk)
-
-    def accumulate(self):
-        res = []
-        for t, c in zip(self.total, self.count):
-            res.append(float(t) / c)
-        return res
-
-    def _init_name(self, name):
-        name = name or 'acc'
-        if self.maxk != 1:
-            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
-        else:
-            self._name = [name]
-
-    def name(self):
-        return self._name
diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py
index 0b12987b10a0510e1035e2b64439de9abe3fcf31..977f9233a954746486afee114b2e87a25096ed38 100644
--- a/python/paddle/incubate/hapi/model.py
+++ b/python/paddle/incubate/hapi/model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,19 +24,27 @@ import six
 import warnings
 from collections import Iterable
 
+import paddle
 from paddle import fluid
-from paddle.fluid.framework import in_dygraph_mode, Variable
+from paddle.fluid import core
+from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place
+# Note: Use alias `Input` temporarily before releasing hapi feature.
+from paddle.static import InputSpec as Input
 from paddle.fluid.executor import global_scope
 from paddle.fluid.io import is_belong_to_optimizer
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 from paddle.fluid.incubate.fleet.base import role_maker
+from paddle.fluid.executor import scope_guard, Executor
 from paddle.io import DataLoader, Dataset
 
+from paddle.fluid.dygraph.layers import Layer
+from paddle.metric import Metric
+
 from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
-from .metrics import Metric
 from .callbacks import config_callbacks
 from .utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args
 from .device import _get_device
@@ -47,40 +55,6 @@ __all__ = [
 ]
 
 
-class Input(fluid.dygraph.Layer):
-    """
-    Define inputs the model.
-
-    Args:
-        name (str): The name/alias of the variable, see :ref:`api_guide_Name`
-            for more details.
-        shape (tuple(integers)|list[integers]): List|Tuple of integers
-            declaring the shape. You can set "None" or -1 at a dimension
-            to indicate the dimension can be of any size. For example,
-            it is useful to set changeable batch size as "None" or -1.
-        dtype (np.dtype|VarType|str, optional): The type of the data. Supported
-            dtype: bool, float16, float32, float64, int8, int16, int32, int64,
-            uint8. Default: float32.
-
-    Examples:
-        .. code-block:: python
-
-        import paddle.incubate.hapi as hapi
-
-        input = hapi.Input('x', [None, 784], 'float32')
-        label = hapi.Input('label', [None, 1], 'int64')
-    """
-
-    def __init__(self, name, shape=None, dtype='float32'):
-        super(Input, self).__init__()
-        self.shape = shape
-        self.dtype = dtype
-        self.name = name
-
-    def forward(self):
-        return fluid.data(self.name, shape=self.shape, dtype=self.dtype)
-
-
 class StaticGraphAdapter(object):
     """
     Model traning/inference with a static graph.
@@ -388,13 +362,13 @@ class StaticGraphAdapter(object):
         with fluid.program_guard(prog, self._startup_prog):
             inputs = self.model._inputs
             labels = self.model._labels if self.model._labels else []
-            inputs = [k.forward() for k in to_list(inputs)]
-            labels = [k.forward() for k in to_list(labels)]
+            inputs = [k._create_feed_layer() for k in to_list(inputs)]
+            labels = [k._create_feed_layer() for k in to_list(labels)]
             self._label_vars[mode] = labels
             outputs = to_list(self.model.network.forward(*inputs))
 
-            if mode != 'test' and self.model._loss_function:
-                losses = self.model._loss_function(*(outputs + labels))
+            if mode != 'test' and self.model._loss:
+                losses = self.model._loss(*(outputs + labels))
 
             if self._nranks > 1 and mode != 'train':
                 outputs = [_all_gather(o, self._nranks) for o in outputs]
@@ -403,8 +377,7 @@ class StaticGraphAdapter(object):
 
             if mode != 'test':
                 for metric in self.model._metrics:
-                    metrics.append(
-                        to_list(metric.add_metric_op(*(outputs + labels))))
+                    metrics.append(to_list(metric.compute(*(outputs + labels))))
 
             if mode == 'train' and self.model._optimizer:
                 self._loss_endpoint = fluid.layers.sum(losses)
@@ -509,7 +482,7 @@ class DynamicGraphAdapter(object):
 
         if self._nranks > 1:
             outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs])
-            losses = self.model._loss_function(*(to_list(outputs) + labels))
+            losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
             final_loss = fluid.layers.sum(losses)
             final_loss = self.ddp_model.scale_loss(final_loss)
@@ -518,7 +491,7 @@ class DynamicGraphAdapter(object):
         else:
             outputs = self.model.network.forward(
                 * [to_variable(x) for x in inputs])
-            losses = self.model._loss_function(*(to_list(outputs) + labels))
+            losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
             final_loss = fluid.layers.sum(losses)
             final_loss.backward()
@@ -527,7 +500,7 @@ class DynamicGraphAdapter(object):
         self.model.network.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(*(to_list(outputs) + labels))
+            metric_outs = metric.compute(*(to_list(outputs) + labels))
             m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
@@ -542,8 +515,8 @@ class DynamicGraphAdapter(object):
         labels = [to_variable(l) for l in to_list(labels)]
 
         outputs = self.model.network.forward(* [to_variable(x) for x in inputs])
-        if self.model._loss_function:
-            losses = self.model._loss_function(*(to_list(outputs) + labels))
+        if self.model._loss:
+            losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
 
         if self._nranks > 1:
@@ -571,13 +544,13 @@ class DynamicGraphAdapter(object):
                     self._merge_count[self.mode + '_total'] += samples
                     self._merge_count[self.mode + '_batch'] = samples
 
-            metric_outs = metric.add_metric_op(*(to_list(outputs) + labels))
+            metric_outs = metric.compute(*(to_list(outputs) + labels))
             m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
-        if self.model._loss_function and len(metrics):
+        if self.model._loss and len(metrics):
             return [to_numpy(l) for l in losses], metrics
-        elif self.model._loss_function:
+        elif self.model._loss:
             return [to_numpy(l) for l in losses]
         else:
             return metrics
@@ -665,21 +638,21 @@ class Model(object):
     """
     An Model object is network with training and inference features.
     Dynamic graph and static graph are supported at the same time,
-    switched by `fluid.enable_dygraph()`. The usage is as follows.
+    switched by `paddle.disable_static()`. The usage is as follows.
     But note, the switching between dynamic and static should be before
     instantiating a Model. The input description, i.e, hapi.Input,
     must be required for static graph.
 
     Args:
-        network (fluid.dygraph.Layer): The network is an instance of
-            fluid.dygraph.Layer.
+        network (paddle.nn.Layer): The network is an instance of
+            paddle.nn.Layer.
         inputs (Input|list|dict|None): `inputs`, entry points of network,
             could be a Input layer, or lits of Input layers,
             or dict (name: Input), or None. For static graph,
             inputs must be set. For dynamic graph, it could be None.
         labels (Input|list|None): `labels`, entry points of network,
             could be a Input layer or lits of Input layers, or None.
-            For static graph, if labels is required in loss_function,
+            For static graph, if labels is required in loss,
             labels must be set. Otherwise, it could be None.
 
 
@@ -687,13 +660,12 @@ class Model(object):
         .. code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import paddle.incubate.hapi as hapi
         
-        class MyNet(fluid.dygraph.Layer):
+        class MyNet(paddle.nn.Layer):
             def __init__(self, classifier_act=None):
                 super(MyNet, self).__init__()
-                self._fc1 = fluid.dygraph.Linear(784, 200, act=classifier_act)
+                self._fc1 = paddle.nn.Linear(784, 200, act=classifier_act)
 
             def forward(self, x):
                 y = self._fc1(x)
@@ -701,18 +673,18 @@ class Model(object):
         
         device = hapi.set_device('gpu')
         # if use static graph, do not set
-        fluid.enable_dygraph(device)
+        paddle.disable_static(device)
         
         # inputs and labels are not required for dynamic graph.
-        input = hapi.Input('x', [None, 784], 'float32')
-        label = hapi.Input('label', [None, 1], 'int64')
+        input = hapi.Input([None, 784], 'float32', 'x')
+        label = hapi.Input([None, 1], 'int64', 'label')
         
         model = hapi.Model(MyNet(), input, label)
-        optim = fluid.optimizer.SGD(learning_rate=1e-3,
+        optim = paddle.optimizer.SGD(learning_rate=1e-3,
             parameter_list=model.parameters())
         model.prepare(optim,
                       paddle.nn.CrossEntropyLoss(),
-                      hapi.metrics.Accuracy())
+                      paddle.metric.Accuracy())
         
         mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
         model.fit(mnist_data, epochs=2, batch_size=32, verbose=1)
@@ -724,7 +696,7 @@ class Model(object):
         self.network = network
         self._inputs = None
         self._labels = None
-        self._loss_function = None
+        self._loss = None
         self._loss_weights = None
         self._optimizer = None
         self._optimizer = None
@@ -734,16 +706,8 @@ class Model(object):
             if not isinstance(inputs, (list, dict, Input)):
                 raise TypeError(
                     "'inputs' must be list or dict in static graph mode")
-        if inputs is None:
-            self._inputs = [Input(name=n) \
-                for n in extract_args(self.network.forward) if n != 'self']
-        elif isinstance(input, dict):
-            self._inputs = [inputs[n] \
-                for n in extract_args(self.network.forward) if n != 'self']
-        else:
-            self._inputs = to_list(inputs)
-
-        self._labels = to_list(labels)
+        self._inputs = self._verify_spec(inputs, True)
+        self._labels = self._verify_spec(labels)
 
         # init backend
         if fluid.in_dygraph_mode():
@@ -772,25 +736,24 @@ class Model(object):
             
               import numpy as np
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self, classifier_act=None):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 10, act=classifier_act)
+                      self._fc = paddle.nn.Linear(784, 10, act=classifier_act)
 
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
 
-              input = hapi.Input('x', [None, 784], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 784], 'float32', 'x')
+              label = hapi.Input([None, 1], 'int64', 'label')
               model = hapi.Model(MyNet(), input, label)
-              optim = fluid.optimizer.SGD(learning_rate=1e-3,
+              optim = paddle.optimizer.SGD(learning_rate=1e-3,
                   parameter_list=model.parameters())
               model.prepare(optim, paddle.nn.CrossEntropyLoss())
               data = np.random.random(size=(4,784)).astype(np.float32)
@@ -821,25 +784,24 @@ class Model(object):
             
               import numpy as np
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self, classifier_act=None):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 10, act=classifier_act)
+                      self._fc = paddle.nn.Linear(784, 10, act=classifier_act)
 
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
 
-              input = hapi.Input('x', [None, 784], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 784], 'float32', 'x')
+              label = hapi.Input([None, 1], 'int64', 'label')
               model = hapi.Model(MyNet(), input, label)
-              optim = fluid.optimizer.SGD(learning_rate=1e-3,
+              optim = paddle.optimizer.SGD(learning_rate=1e-3,
                   parameter_list=model.parameters())
               model.prepare(optim,
                             paddle.nn.CrossEntropyLoss())
@@ -867,46 +829,54 @@ class Model(object):
             .. code-block:: python
             
               import numpy as np
-              import paddle.fluid as fluid
+              import paddle
               import paddle.incubate.hapi as hapi
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 1, act='softmax')
+                      self._fc = paddle.nn.Linear(784, 1, act='softmax')
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
 
               model = hapi.Model(MyNet())
               model.prepare()
               data = np.random.random(size=(4,784)).astype(np.float32)
-              out = model.eval_batch([data])
+              out = model.test_batch([data])
               print(out)
         """
         return self._adapter.test_batch(inputs)
 
-    def save(self, path):
-        """
-        This function saves parameters, optimizer infomation to path.
+    def save(self, path, training=True):
+        """  
+        This function saves parameters, optimizer information or model and 
+        paramters only for inference to path. It depends on the parameter
+        `training`.
 
-        The parameters contains all the trainable Variable, will save to
-        a file with suffix ".pdparams".
+        If `training` is set to True, the parameters saved contain all 
+        the trainable Variable, will save to a file with suffix ".pdparams".
         The optimizer information contains all the variable used by optimizer.
         For Adam optimizer, contains beta1, beta2, momentum etc. All the
         information will save to a file with suffix ".pdopt". (If the optimizer
         have no variable need to save (like SGD), the fill will not generated).
+        This function will silently overwrite existing file at the target location.
 
-        This function will silently overwrite existing file
-        at the target location.
+        If `training` is set to False, only inference model will be saved. It 
+        should be noted that before using `save`, you should run the model, and 
+        the shape of input you saved is as same as the input of its running.
+        `@paddle.jit.to_static` must be added on `forward` function of your layer 
+        in dynamic mode now and these will be optimized later.
 
         Args:
             path (str): The file prefix to save model. The format is
                 'dirname/file_prefix' or 'file_prefix'. if empty str. A exception
                  will be raised.
+            training (bool, optional): Whether to save for training. If not, save
+                for inference only. Default: True.
 
         Returns:
             None
@@ -914,25 +884,47 @@ class Model(object):
         Examples:
 
             .. code-block:: python
-            
-              import paddle.fluid as fluid
-              import paddle.incubate.hapi as hapi
-              
-              class MyNet(fluid.dygraph.Layer):
-                  def __init__(self):
-                      super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 1, act='softmax')
+                import paddle
+                import paddle.incubate.hapi as hapi
+                from paddle.nn import Linear
+                from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
+
+                class Mnist(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyNet, self).__init__()
+                        self._fc = Linear(784, 1, act='softmax')
+
+                  @paddle.jit.to_static # If save for inference in dygraph, need this
                   def forward(self, x):
                       y = self._fc(x)
                       return y
-              
-              device = hapi.set_device('cpu')
-              fluid.enable_dygraph(device)
-              model = hapi.Model(MyNet())
-              model.save('checkpoint/test')
+
+                dynamic = True # False
+                device = hapi.set_device('cpu')
+                # if use static graph, do not set
+                paddle.disable_static(device) if dynamic else None
+
+                # inputs and labels are not required for dynamic graph.
+                input = hapi.Input([None, 784], 'float32', 'x')
+                label = hapi.Input([None, 1], 'int64', 'label')
+
+                model = hapi.Model(Mnist(), input, label)
+                optim = paddle.optimizer.SGD(learning_rate=1e-3,
+                    parameter_list=model.parameters())
+                model.prepare(optim,
+                                paddle.nn.CrossEntropyLoss(),
+                                hapi.metrics.Accuracy())
+                mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
+                model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
+                model.save('checkpoint/test') # save for training
+                model.save('inference_model', False) # save for inference
         """
+
         if ParallelEnv().local_rank == 0:
-            self._adapter.save(path)
+            if not training:
+                self._save_inference_model(path)
+            else:
+                self._adapter.save(path)
 
     def load(self, path, skip_mismatch=False, reset_optimizer=False):
         """
@@ -967,19 +959,19 @@ class Model(object):
 
             .. code-block:: python
             
-              import paddle.fluid as fluid
+              import paddle
               import paddle.incubate.hapi as hapi
               
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(784, 1, act='softmax')
+                      self._fc = paddle.nn.Linear(784, 1, act='softmax')
                   def forward(self, x):
                       y = self._fc(x)
                       return y
               
               device = hapi.set_device('cpu')
-              fluid.enable_dygraph(device)
+              paddle.disable_static(device)
               model = hapi.Model(MyNet())
               model.load('checkpoint/test')
         """
@@ -1042,24 +1034,24 @@ class Model(object):
 
             .. code-block:: python
 
-              import paddle.fluid as fluid
+              import paddle
               from paddle.incubate.hapi import Model
 
-              class MyNet(fluid.dygraph.Layer):
+              class MyNet(paddle.nn.Layer):
                   def __init__(self):
                       super(MyNet, self).__init__()
-                      self._fc = fluid.dygraph.Linear(20, 10, act='softmax')
+                      self._fc = paddle.nn.Linear(20, 10, act='softmax')
                   def forward(self, x):
                       y = self._fc(x)
                       return y
 
-              fluid.enable_dygraph()
+              paddle.disable_static()
               model = Model(MyNet())
               params = model.parameters()
         """
         return self._adapter.parameters()
 
-    def prepare(self, optimizer=None, loss_function=None, metrics=None):
+    def prepare(self, optimizer=None, loss=None, metrics=None):
         """
         Configures the model before runing.
 
@@ -1067,8 +1059,8 @@ class Model(object):
             optimizer (Optimizer|None): Optimizer must be set in training
                 and should be a Optimizer instance. It can be None in eval
                 and test mode.
-            loss_function (Loss|callable function|None): Loss function can
-                be a `fluid.dygraph.Layer` instance or any callable function
+            loss (Loss|callable function|None): Loss function can
+                be a `paddle.nn.Layer` instance or any callable function
                 taken the predicted values and ground truth values as input.
                 It can be None when there is no loss.
             metrics (Metric|list of Metric|None): If metrics is set, all
@@ -1087,7 +1079,7 @@ class Model(object):
                     startup_prog_seed = fluid.default_startup_program(
                     ).random_seed
                     fluid.disable_dygraph()
-                    fluid.enable_dygraph(self._place)
+                    paddle.disable_static(self._place)
                     # enable_dygraph would create and switch to a new program,
                     # thus also copy seed to the new program
                     fluid.default_main_program().random_seed = main_prog_seed
@@ -1099,12 +1091,11 @@ class Model(object):
                 _parallel_context_initialized = True
 
         self._optimizer = optimizer
-        if loss_function:
-            if not isinstance(loss_function, fluid.dygraph.Layer) or \
-               not callable(loss_function):
-                raise TypeError("'loss_function' must be sub classes of \
-                    `fluid.dygraph.Layer` or any callable function.")
-        self._loss_function = loss_function
+        if loss is not None:
+            if not isinstance(loss, paddle.nn.Layer) and not callable(loss):
+                raise TypeError("'loss' must be sub classes of " \
+                    "`paddle.nn.Layer` or any callable function.")
+        self._loss = loss
 
         metrics = metrics or []
         for metric in to_list(metrics):
@@ -1184,27 +1175,26 @@ class Model(object):
             .. code-block:: python
 
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
               dynamic = True
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device) if dynamic else None
+              paddle.disable_static(device) if dynamic else None
            
               train_dataset = hapi.datasets.MNIST(mode='train')
               val_dataset = hapi.datasets.MNIST(mode='test')
            
-              input = hapi.Input('image', [None, 1, 28, 28], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 1, 28, 28], 'float32', 'image')
+              label = hapi.Input([None, 1], 'int64', 'label')
            
               model = hapi.Model(hapi.vision.LeNet(classifier_activation=None),
                   input, label)
-              optim = fluid.optimizer.Adam(
-                  learning_rate=0.001, parameter_list=model.parameters())
+              optim = paddle.optimizer.Adam(
+                  learning_rate=0.001, parameters=model.parameters())
               model.prepare(
                   optim,
                   paddle.nn.CrossEntropyLoss(),
-                  hapi.metrics.Accuracy(topk=(1, 2)))
+                  paddle.metric.Accuracy(topk=(1, 2)))
               model.fit(train_dataset,
                         val_dataset,
                         epochs=2,
@@ -1217,31 +1207,30 @@ class Model(object):
             .. code-block:: python
 
               import paddle
-              import paddle.fluid as fluid
               import paddle.incubate.hapi as hapi
 
               dynamic = True
               device = hapi.set_device('gpu')
-              fluid.enable_dygraph(device) if dynamic else None
+              paddle.disable_static(device) if dynamic else None
            
               train_dataset = hapi.datasets.MNIST(mode='train')
-              train_loader = fluid.io.DataLoader(train_dataset,
+              train_loader = paddle.io.DataLoader(train_dataset,
                   places=device, batch_size=64)
               val_dataset = hapi.datasets.MNIST(mode='test')
-              val_loader = fluid.io.DataLoader(val_dataset,
+              val_loader = paddle.io.DataLoader(val_dataset,
                   places=device, batch_size=64)
            
-              input = hapi.Input('image', [None, 1, 28, 28], 'float32')
-              label = hapi.Input('label', [None, 1], 'int64')
+              input = hapi.Input([None, 1, 28, 28], 'float32', 'image')
+              label = hapi.Input([None, 1], 'int64', 'label')
            
               model = hapi.Model(hapi.vision.LeNet(classifier_activation=None),
                   input, label)
-              optim = fluid.optimizer.Adam(
-                  learning_rate=0.001, parameter_list=model.parameters())
+              optim = paddle.optimizer.Adam(
+                  learning_rate=0.001, parameters=model.parameters())
               model.prepare(
                   optim,
                   paddle.nn.CrossEntropyLoss(),
-                  hapi.metrics.Accuracy(topk=(1, 2)))
+                  paddle.metric.Accuracy(topk=(1, 2)))
               model.fit(train_loader,
                         val_loader,
                         epochs=2,
@@ -1353,24 +1342,24 @@ class Model(object):
         Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import paddle.incubate.hapi as hapi
 
             # declarative mode
             val_dataset = hapi.datasets.MNIST(mode='test')
 
-            input = hapi.Input('image', [-1, 1, 28, 28], 'float32')
-            label = hapi.Input('label', [None, 1], 'int64')
+            input = hapi.Input([-1, 1, 28, 28], 'float32', 'image')
+            label = hapi.Input([None, 1], 'int64', 'label')
             model = hapi.Model(hapi.vision.LeNet(), input, label)
-            model.prepare(metrics=hapi.metrics.Accuracy())
+            model.prepare(metrics=paddle.metric.Accuracy())
 
             result = model.evaluate(val_dataset, batch_size=64)
             print(result)
 
             # imperative mode
-            fluid.enable_dygraph()
+            paddle.disable_static()
             model = hapi.Model(hapi.vision.LeNet())
-            model.prepare(metrics=hapi.metrics.Accuracy())
+            model.prepare(metrics=paddle.metric.Accuracy())
             result = model.evaluate(val_dataset, batch_size=64)
             print(result)
                 
@@ -1433,12 +1422,13 @@ class Model(object):
             num_workers (int): The number of subprocess to load data, 0 for no subprocess 
                 used and loading data in main process. When train_data and eval_data are
                 both the instance of Dataloader, this argument will be ignored. Default: 0.
-            stack_output (bool): Whether stack output field like a batch, as for an output
+            stack_outputs (bool): Whether stack output field like a batch, as for an output
                 filed of a sample is in shape [X, Y], test_data contains N samples, predict
                 output field will be in shape [N, X, Y] if stack_output is True, and will
                 be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs
                 is False. stack_outputs as False is used for LoDTensor output situation,
                 it is recommended set as True if outputs contains no LoDTensor. Default: False.
+            callbacks(Callback): A Callback instance, default None.
         Returns:
             list: output of models.
 
@@ -1446,7 +1436,7 @@ class Model(object):
         .. code-block:: python
 
             import numpy as np
-            import paddle.fluid as fluid
+            import paddle
             import paddle.incubate.hapi as hapi
 
             class MnistDataset(hapi.datasets.MNIST):
@@ -1466,7 +1456,7 @@ class Model(object):
             test_dataset = MnistDataset(mode='test', return_label=False)
 
             # declarative mode
-            input = hapi.Input('image', [-1, 1, 28, 28], 'float32')
+            input = hapi.Input([-1, 1, 28, 28], 'float32', 'image')
             model = hapi.Model(hapi.vision.LeNet(), input)
             model.prepare()
 
@@ -1475,7 +1465,7 @@ class Model(object):
 
             # imperative mode
             device = hapi.set_device('cpu')
-            fluid.enable_dygraph(device)
+            paddle.disable_static(device)
             model = hapi.Model(hapi.vision.LeNet())
             model.prepare()
             result = model.predict(test_dataset, batch_size=64)
@@ -1519,13 +1509,17 @@ class Model(object):
         cbks.on_end('test', logs)
         return outputs
 
-    def save_inference_model(self,
-                             save_dir,
-                             model_filename=None,
-                             params_filename=None,
-                             model_only=False):
+    def _save_inference_model(self,
+                              save_dir,
+                              model_filename=None,
+                              params_filename=None,
+                              model_only=False):
         """
-        Save inference model must in static mode.
+        Save inference model can be in static or dynamic mode.
+        It should be noted that before using `save_inference_model`, you should
+        run the model, and the shape you saved is as same as the input of its
+        running. `@paddle.jit.to_static` must be added on `forward` function of
+        your layer in dynamic mode now and these will be optimized later.
 
         Args:
             save_dir (str): The directory path to save the inference model.
@@ -1541,40 +1535,145 @@ class Model(object):
         Returns:
             list: The fetch variables' name list
 
-
         Examples:
         .. code-block:: python
+            import numpy as np
+            import paddle
+            from paddle.static import InputSpec
 
-            import paddle.fluid as fluid
             import paddle.incubate.hapi as hapi
-
-            input = hapi.Input('image', [-1, 1, 28, 28], 'float32')
-            model = hapi.Model(hapi.vision.LeNet(), input)
-            model.prepare()
-
+            from paddle.nn import Linear
+            from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
+
+            class Mnist(Layer):
+                def __init__(self, classifier_act=None):
+                    super(Mnist, self).__init__()
+
+                    self.fc = Linear(input_dim=784, output_dim=10, act="softmax")
+
+                @paddle.jit.to_static # In static mode, you need to delete this.
+                def forward(self, inputs):
+                    outputs = self.fc(inputs)
+                    return outputs
+
+            dynamic = True # False
+            device = hapi.set_device('gpu')
+
+            # if use static graph, do not set
+            paddle.disable_static(device) if dynamic else None
+
+            # inputs and labels are not required for dynamic graph.
+            input = InputSpec([None, 784], 'float32', 'x')
+            label = InputSpec([None, 1], 'int64', 'label')
+
+            model = hapi.Model(Mnist(), input, label)
+            optim = paddle.optimizer.SGD(learning_rate=1e-3,
+                parameter_list=model.parameters())
+            model.prepare(optim,
+                            paddle.nn.CrossEntropyLoss(),
+                            hapi.metrics.Accuracy())
+            mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
+            model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
             model.save_inference_model('inference_model')
         """
-        assert not fluid.in_dygraph_mode(
-        ), 'Save inference model must in static mode!'
 
-        prog = self._adapter._progs.get('test', None)
-        assert prog, \
-            "Model is not ready, please call `model.prepare()` first"
+        def get_inout_spec(all_vars, return_name=False):
+            result_list = []
+            valid_vars = [var for var in all_vars if isinstance(var, Variable)]
+            result_list = valid_vars
+            if return_name:
+                result_list = [var.name for var in result_list]
 
-        infer_prog = prog.clone(for_test=True)
+            return result_list
 
-        input_names = [v.name for v in self._adapter._input_vars['test']]
-        endpoints = self._adapter._endpoints['test']['output']
+        # TODO:
+        # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph.
+        # 2. Save correct shape of input, now the interface stores the shape that the user sent to 
+        #    the inputs of the model in running.
+        # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
+        if fluid.in_dygraph_mode():
+            layer = self.network
+            fluid.disable_dygraph()
+
+            # 1. input check
+            prog_translator = ProgramTranslator()
+            if not prog_translator.enable_declarative:
+                raise RuntimeError(
+                    "save_inference_model doesn't work when setting ProgramTranslator.enable=False."
+                )
+            if not isinstance(layer, Layer):
+                raise TypeError(
+                    "The input layer should be 'Layer', but received layer type is %s."
+                    % type(layer))
+
+            # 2. get program of declarative Layer.forward
+            prog_cache = prog_translator.get_program_cache()
+            # make dummy args & kwargs, to get excepted FunctionSpec
+            layer_func = FunctionSpec(type(layer).forward, [layer], {})
+            concrete_program, _ = prog_cache.get_program(layer_func)
+
+            # NOTE: we maintain the mapping of variable name to
+            # structured name, the buffer variable (non-persistable)
+            # saved to inference program may not need by dygraph Layer,
+            # we only record the state_dict variable's structured name
+            state_names_dict = dict()
+            for structured_name, var in layer.state_dict().items():
+                state_names_dict[var.name] = structured_name
+
+            # 3. share parameters from Layer to scope & record var info
+            scope = core.Scope()
+            extra_var_info = dict()
+            for param_or_buffer in concrete_program.parameters:
+                # share to scope
+                param_or_buffer_tensor = scope.var(
+                    param_or_buffer.name).get_tensor()
+                src_tensor = param_or_buffer.value().get_tensor()
+                param_or_buffer_tensor._share_data_with(src_tensor)
+                # record var info
+                extra_info_dict = dict()
+                if param_or_buffer.name in state_names_dict:
+                    extra_info_dict['structured_name'] = state_names_dict[
+                        param_or_buffer.name]
+                extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
+                if isinstance(param_or_buffer, ParamBase):
+                    extra_info_dict['trainable'] = param_or_buffer.trainable
+                extra_var_info[param_or_buffer.name] = extra_info_dict
+
+            # 4. build input & output spec
+            input_var_names = get_inout_spec(concrete_program.inputs, True)
+            output_vars = get_inout_spec(concrete_program.outputs)
+
+            # 5. save inference model
+            with scope_guard(scope):
+                return fluid.io.save_inference_model(
+                    dirname=save_dir,
+                    feeded_var_names=input_var_names,
+                    target_vars=output_vars,
+                    executor=Executor(_current_expected_place()),
+                    main_program=concrete_program.main_program.clone(),
+                    model_filename=model_filename,
+                    params_filename=params_filename,
+                    program_only=model_only)
+
+        else:
+            prog = self._adapter._progs.get('test', None)
+            assert prog, \
+                "Model is not ready, please call `model.prepare()` first"
 
-        return fluid.io.save_inference_model(
-            save_dir,
-            input_names,
-            endpoints,
-            self._adapter._executor,
-            main_program=infer_prog,
-            model_filename=model_filename,
-            params_filename=params_filename,
-            program_only=model_only)
+            infer_prog = prog.clone(for_test=True)
+
+            input_names = [v.name for v in self._adapter._input_vars['test']]
+            endpoints = self._adapter._endpoints['test']['output']
+
+            return fluid.io.save_inference_model(
+                save_dir,
+                input_names,
+                endpoints,
+                self._adapter._executor,
+                main_program=infer_prog,
+                model_filename=model_filename,
+                params_filename=params_filename,
+                program_only=model_only)
 
     def _run_one_epoch(self, data_loader, callbacks, mode, logs={}):
         outputs = []
@@ -1601,9 +1700,9 @@ class Model(object):
             if mode != 'test':
                 outs = getattr(self, mode + '_batch')(data[:len(self._inputs)],
                                                       data[len(self._inputs):])
-                if self._metrics and self._loss_function:
+                if self._metrics and self._loss:
                     metrics = [[l[0] for l in outs[0]]]
-                elif self._loss_function:
+                elif self._loss:
                     metrics = [[l[0] for l in outs]]
                 else:
                     metrics = []
@@ -1639,12 +1738,42 @@ class Model(object):
             return logs, outputs
         return logs
 
+    def _verify_spec(self, specs, is_input=False):
+        out_specs = []
+
+        if specs is None:
+            # If not specific specs of `Input`, using argument names of `forward` function
+            # to generate `Input`.
+            if is_input:
+                out_specs = [
+                    Input(name=n) for n in extract_args(self.network.forward)
+                    if n != 'self'
+                ]
+            else:
+                out_specs = to_list(specs)
+        elif isinstance(specs, dict):
+            assert is_input == False
+            out_specs = [specs[n] \
+                for n in extract_args(self.network.forward) if n != 'self']
+        else:
+            out_specs = to_list(specs)
+        # Note: checks each element has specificed `name`.
+        if out_specs is not None:
+            for i, spec in enumerate(out_specs):
+                assert isinstance(spec, Input)
+                if spec.name is None:
+                    raise ValueError(
+                        "Requires Input[{}].name != None, but receive `None` with {}.".
+                        format(i, spec))
+
+        return out_specs
+
     def _reset_metrics(self):
         for metric in self._metrics:
             metric.reset()
 
     def _metrics_name(self):
-        metrics_name = ['loss'] if self._loss_function else []
+        metrics_name = ['loss'] if self._loss else []
         for m in self._metrics:
             metrics_name.extend(to_list(m.name()))
         return metrics_name
diff --git a/python/paddle/incubate/hapi/tests/CMakeLists.txt b/python/paddle/incubate/hapi/tests/CMakeLists.txt
index 5cad495de7c88781de50de9b2bbe1a765a45582f..8ffcd67443f1c8722560da20d9cfb76b18a67351 100644
--- a/python/paddle/incubate/hapi/tests/CMakeLists.txt
+++ b/python/paddle/incubate/hapi/tests/CMakeLists.txt
@@ -12,6 +12,7 @@ endforeach()
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150) 
 
 
 function(py_dist_test TARGET_NAME)
diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py
index b338f3310b4c796e66d88b21f1bb8353dbf5b572..ede99a50c2fa72da3bd1999204a5fe1e5a656be2 100644
--- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py
+++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py
@@ -25,7 +25,7 @@ from paddle import fluid
 from paddle.incubate.hapi import Model, Input, set_device
 from paddle.nn.layer.loss import CrossEntropyLoss
 from paddle.incubate.hapi.vision.models import LeNet
-from paddle.incubate.hapi.metrics import Accuracy
+from paddle.metric import Accuracy
 from paddle.incubate.hapi.callbacks import ProgBarLogger
 from paddle.incubate.hapi.datasets import MNIST
 
@@ -64,8 +64,8 @@ class TestDistTraning(unittest.TestCase):
         im_shape = (-1, 1, 28, 28)
         batch_size = 128
 
-        inputs = [Input('image', im_shape, 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input(im_shape, 'float32', 'image')]
+        labels = [Input([None, 1], 'int64', 'label')]
 
         model = Model(LeNet(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.Momentum(
diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py
index 1484620a4efdfff0c084153e9edb001833d744ef..28305fc6a6fd08c160f946920e85391cd444caef 100644
--- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py
+++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py
@@ -25,7 +25,7 @@ from paddle import fluid
 from paddle.incubate.hapi import Model, Input, set_device
 from paddle.nn.layer.loss import CrossEntropyLoss
 from paddle.incubate.hapi.vision.models import LeNet
-from paddle.incubate.hapi.metrics import Accuracy
+from paddle.metric import Accuracy
 from paddle.incubate.hapi.callbacks import ProgBarLogger
 from paddle.incubate.hapi.datasets import MNIST
 
@@ -63,8 +63,8 @@ class TestDistTraning(unittest.TestCase):
         im_shape = (-1, 1, 28, 28)
         batch_size = 128
 
-        inputs = [Input('image', im_shape, 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input(im_shape, 'float32', 'image')]
+        labels = [Input([None, 1], 'int64', 'label')]
 
         model = Model(LeNet(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.Momentum(
diff --git a/python/paddle/incubate/hapi/tests/test_callbacks.py b/python/paddle/incubate/hapi/tests/test_callbacks.py
index 2a8a470736d921628edadb55b7e0cc956e2f37f1..e49bf215c276c8b495b0f991a5821d4c674f48d2 100644
--- a/python/paddle/incubate/hapi/tests/test_callbacks.py
+++ b/python/paddle/incubate/hapi/tests/test_callbacks.py
@@ -36,7 +36,7 @@ class TestCallbacks(unittest.TestCase):
         freq = 2
         eval_steps = 20
 
-        inputs = [Input('image', [None, 1, 28, 28], 'float32')]
+        inputs = [Input([None, 1, 28, 28], 'float32', 'image')]
         lenet = Model(LeNet(), inputs)
         lenet.prepare()
 
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_cifar.py b/python/paddle/incubate/hapi/tests/test_dataset_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d9f4353c0ed639f5ad907c921bf7b2c88271f5
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_cifar.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestCifar10Train(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar10(mode='train')
+        self.assertTrue(len(cifar) == 50000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 50000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 9)
+
+
+class TestCifar10Test(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar10(mode='test')
+        self.assertTrue(len(cifar) == 10000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 10000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 9)
+
+
+class TestCifar100Train(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar100(mode='train')
+        self.assertTrue(len(cifar) == 50000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 50000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 99)
+
+
+class TestCifar100Test(unittest.TestCase):
+    def test_main(self):
+        cifar = Cifar100(mode='test')
+        self.assertTrue(len(cifar) == 10000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 10000)
+        data, label = cifar[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(0 <= int(label) <= 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_conll05.py b/python/paddle/incubate/hapi/tests/test_dataset_conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ed2a4180d0cb341f5d57bdf1cb9d8ef145a44fb
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_conll05.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestConll05st(unittest.TestCase):
+    def test_main(self):
+        conll05st = Conll05st()
+        self.assertTrue(len(conll05st) == 5267)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 5267)
+        sample = conll05st[idx]
+        self.assertTrue(len(sample) == 9)
+        for s in sample:
+            self.assertTrue(len(s.shape) == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imdb.py b/python/paddle/incubate/hapi/tests/test_dataset_imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef73634b6b5fb114fa88b785bb77a87fe129bd5
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_imdb.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestImdbTrain(unittest.TestCase):
+    def test_main(self):
+        imdb = Imdb(mode='train')
+        self.assertTrue(len(imdb) == 25000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 25000)
+        data, label = imdb[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(label.shape[0] == 1)
+        self.assertTrue(int(label) in [0, 1])
+
+
+class TestImdbTest(unittest.TestCase):
+    def test_main(self):
+        imdb = Imdb(mode='test')
+        self.assertTrue(len(imdb) == 25000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 25000)
+        data, label = imdb[idx]
+        self.assertTrue(len(data.shape) == 1)
+        self.assertTrue(label.shape[0] == 1)
+        self.assertTrue(int(label) in [0, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py b/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3d97d314acbf7f55a8482fd386581fef7f16e03
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestImikolovTrain(unittest.TestCase):
+    def test_main(self):
+        imikolov = Imikolov(mode='train', data_type='NGRAM', window_size=2)
+        self.assertTrue(len(imikolov) == 929589)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 929589)
+        data = imikolov[idx]
+        self.assertTrue(len(data) == 2)
+
+
+class TestImikolovTest(unittest.TestCase):
+    def test_main(self):
+        imikolov = Imikolov(mode='test', data_type='NGRAM', window_size=2)
+        self.assertTrue(len(imikolov) == 82430)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 82430)
+        data = imikolov[idx]
+        self.assertTrue(len(data) == 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py b/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a7a3035ee0e86f8ee2fa9e8a23f6036758d2d
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestMovieReviewsTrain(unittest.TestCase):
+    def test_main(self):
+        movie_reviews = MovieReviews(mode='train')
+        self.assertTrue(len(movie_reviews) == 1600)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1600)
+        data = movie_reviews[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(int(data[1]) in [0, 1])
+
+
+class TestMovieReviewsTest(unittest.TestCase):
+    def test_main(self):
+        movie_reviews = MovieReviews(mode='test')
+        self.assertTrue(len(movie_reviews) == 400)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 400)
+        data = movie_reviews[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(int(data[1]) in [0, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movielens.py b/python/paddle/incubate/hapi/tests/test_dataset_movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..f94269f930e05e04b3bdfc4324e5ae1ea15b1fb9
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_movielens.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestMovielensTrain(unittest.TestCase):
+    def test_main(self):
+        movielens = Movielens(mode='train')
+        # movielens dataset random split train/test
+        # not check dataset length here
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 900000)
+        data = movielens[idx]
+        self.assertTrue(len(data) == 8)
+        for i, d in enumerate(data):
+            self.assertTrue(len(d.shape) == 1)
+            if i not in [5, 6]:
+                self.assertTrue(d.shape[0] == 1)
+
+
+class TestMovielensTest(unittest.TestCase):
+    def test_main(self):
+        movielens = Movielens(mode='test')
+        # movielens dataset random split train/test
+        # not check dataset length here
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 100000)
+        data = movielens[idx]
+        self.assertTrue(len(data) == 8)
+        for i, d in enumerate(data):
+            self.assertTrue(len(d.shape) == 1)
+            if i not in [5, 6]:
+                self.assertTrue(d.shape[0] == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py b/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..768367bff9911a352ea6b13f279d5b71938bc85b
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestUCIHousingTrain(unittest.TestCase):
+    def test_main(self):
+        uci_housing = UCIHousing(mode='train')
+        self.assertTrue(len(uci_housing) == 404)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 404)
+        data = uci_housing[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(data[0].shape[0] == 13)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(data[1].shape[0] == 1)
+
+
+class TestUCIHousingTest(unittest.TestCase):
+    def test_main(self):
+        uci_housing = UCIHousing(mode='test')
+        self.assertTrue(len(uci_housing) == 102)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 102)
+        data = uci_housing[idx]
+        self.assertTrue(len(data) == 2)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(data[0].shape[0] == 13)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(data[1].shape[0] == 1)
+
+
+class TestWMT14Train(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='train', dict_size=50)
+        self.assertTrue(len(wmt14) == 191155)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 191155)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Test(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='test', dict_size=50)
+        self.assertTrue(len(wmt14) == 5957)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 5957)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Gen(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='gen', dict_size=50)
+        self.assertTrue(len(wmt14) == 3001)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 3001)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_voc.py b/python/paddle/incubate/hapi/tests/test_dataset_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..85766ab8e30a3a7abd5e2966e6353b116c03e926
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_voc.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import voc2012, VOC2012
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+# VOC2012 is too large for unittest to download, stub a small dataset here
+voc2012.VOC_URL = 'https://paddlemodels.bj.bcebos.com/voc2012_stub/VOCtrainval_11-May-2012.tar'
+voc2012.VOC_MD5 = '34cb1fe5bdc139a5454b25b16118fff8'
+
+
+class TestVOC2012Train(unittest.TestCase):
+    def test_main(self):
+        voc2012 = VOC2012(mode='train')
+        self.assertTrue(len(voc2012) == 3)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 3)
+        image, label = voc2012[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(len(label.shape) == 2)
+
+
+class TestVOC2012Valid(unittest.TestCase):
+    def test_main(self):
+        voc2012 = VOC2012(mode='valid')
+        self.assertTrue(len(voc2012) == 1)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1)
+        image, label = voc2012[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(len(label.shape) == 2)
+
+
+class TestVOC2012Test(unittest.TestCase):
+    def test_main(self):
+        voc2012 = VOC2012(mode='test')
+        self.assertTrue(len(voc2012) == 2)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1)
+        image, label = voc2012[idx]
+        self.assertTrue(len(image.shape) == 3)
+        self.assertTrue(len(label.shape) == 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_dataset_wmt.py b/python/paddle/incubate/hapi/tests/test_dataset_wmt.py
new file mode 100644
index 0000000000000000000000000000000000000000..987e55676aadb77582c58b13e626d7258f3c75b5
--- /dev/null
+++ b/python/paddle/incubate/hapi/tests/test_dataset_wmt.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import tempfile
+import shutil
+import cv2
+
+from paddle.incubate.hapi.datasets import *
+from paddle.incubate.hapi.datasets.utils import _check_exists_and_download
+
+
+class TestWMT14Train(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='train', dict_size=50)
+        self.assertTrue(len(wmt14) == 191155)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 191155)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Test(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='test', dict_size=50)
+        self.assertTrue(len(wmt14) == 5957)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 5957)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT14Gen(unittest.TestCase):
+    def test_main(self):
+        wmt14 = WMT14(mode='gen', dict_size=50)
+        self.assertTrue(len(wmt14) == 3001)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 3001)
+        data = wmt14[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT16Train(unittest.TestCase):
+    def test_main(self):
+        wmt16 = WMT16(
+            mode='train', src_dict_size=50, trg_dict_size=50, lang='en')
+        self.assertTrue(len(wmt16) == 29000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 29000)
+        data = wmt16[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT16Test(unittest.TestCase):
+    def test_main(self):
+        wmt16 = WMT16(
+            mode='test', src_dict_size=50, trg_dict_size=50, lang='en')
+        self.assertTrue(len(wmt16) == 1000)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1000)
+        data = wmt16[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+class TestWMT16Val(unittest.TestCase):
+    def test_main(self):
+        wmt16 = WMT16(mode='val', src_dict_size=50, trg_dict_size=50, lang='en')
+        self.assertTrue(len(wmt16) == 1014)
+
+        # traversal whole dataset may cost a
+        # long time, randomly check 1 sample
+        idx = np.random.randint(0, 1014)
+        data = wmt16[idx]
+        self.assertTrue(len(data) == 3)
+        self.assertTrue(len(data[0].shape) == 1)
+        self.assertTrue(len(data[1].shape) == 1)
+        self.assertTrue(len(data[2].shape) == 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_metrics.py b/python/paddle/incubate/hapi/tests/test_metrics.py
deleted file mode 100644
index 3d25a275d5f1c539ce959c5231a7af771b229836..0000000000000000000000000000000000000000
--- a/python/paddle/incubate/hapi/tests/test_metrics.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import numpy as np
-
-import paddle.fluid as fluid
-from paddle.fluid.dygraph.base import to_variable
-
-from paddle.incubate.hapi.metrics import *
-from paddle.incubate.hapi.utils import to_list
-
-
-def accuracy(pred, label, topk=(1, )):
-    maxk = max(topk)
-    pred = np.argsort(pred)[:, ::-1][:, :maxk]
-    correct = (pred == np.repeat(label, maxk, 1))
-
-    batch_size = label.shape[0]
-    res = []
-    for k in topk:
-        correct_k = correct[:, :k].sum()
-        res.append(correct_k / batch_size)
-    return res
-
-
-def convert_to_one_hot(y, C):
-    oh = np.random.random((y.shape[0], C)).astype('float32') * .5
-    for i in range(y.shape[0]):
-        oh[i, int(y[i])] = 1.
-    return oh
-
-
-class TestAccuracyDynamic(unittest.TestCase):
-    def setUp(self):
-        self.topk = (1, )
-        self.class_num = 5
-        self.sample_num = 1000
-        self.name = None
-
-    def random_pred_label(self):
-        label = np.random.randint(0, self.class_num,
-                                  (self.sample_num, 1)).astype('int64')
-        pred = np.random.randint(0, self.class_num,
-                                 (self.sample_num, 1)).astype('int32')
-        pred_one_hot = convert_to_one_hot(pred, self.class_num)
-        pred_one_hot = pred_one_hot.astype('float32')
-
-        return label, pred_one_hot
-
-    def test_main(self):
-        with fluid.dygraph.guard(fluid.CPUPlace()):
-            acc = Accuracy(topk=self.topk, name=self.name)
-            for _ in range(10):
-                label, pred = self.random_pred_label()
-                label_var = to_variable(label)
-                pred_var = to_variable(pred)
-                state = to_list(acc.add_metric_op(pred_var, label_var))
-                acc.update(* [s.numpy() for s in state])
-                res_m = acc.accumulate()
-                res_f = accuracy(pred, label, self.topk)
-                assert np.all(np.isclose(np.array(res_m, dtype='float64'), np.array(res_f, dtype='float64'), rtol=1e-3)), \
-                        "Accuracy precision error: {} != {}".format(res_m, res_f)
-                acc.reset()
-                assert np.sum(acc.total) == 0
-                assert np.sum(acc.count) == 0
-
-
-class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic):
-    def setUp(self):
-        self.topk = (1, 5)
-        self.class_num = 10
-        self.sample_num = 1000
-        self.name = "accuracy"
-
-
-class TestAccuracyStatic(TestAccuracyDynamic):
-    def test_main(self):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            pred = fluid.data(
-                name='pred', shape=[None, self.class_num], dtype='float32')
-            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            acc = Accuracy(topk=self.topk, name=self.name)
-            state = acc.add_metric_op(pred, label)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        compiled_main_prog = fluid.CompiledProgram(main_prog)
-
-        for _ in range(10):
-            label, pred = self.random_pred_label()
-            state_ret = exe.run(compiled_main_prog,
-                                feed={'pred': pred,
-                                      'label': label},
-                                fetch_list=[s.name for s in to_list(state)],
-                                return_numpy=True)
-            acc.update(*state_ret)
-            res_m = acc.accumulate()
-            res_f = accuracy(pred, label, self.topk)
-            assert np.all(np.isclose(np.array(res_m, dtype='float64'), np.array(res_f, dtype='float64'), rtol=1e-3)), \
-                    "Accuracy precision error: {} != {}".format(res_m, res_f)
-            acc.reset()
-            assert np.sum(acc.total) == 0
-            assert np.sum(acc.count) == 0
-
-
-class TestAccuracyStaticMultiTopk(TestAccuracyStatic):
-    def setUp(self):
-        self.topk = (1, 5)
-        self.class_num = 10
-        self.sample_num = 1000
-        self.name = "accuracy"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/incubate/hapi/tests/test_model.py
index f8be2e242568de10bfbf14fb3b88ef88fb0094da..96c432e1bfd8f3620c705be62c4e9d90c2709fa5 100644
--- a/python/paddle/incubate/hapi/tests/test_model.py
+++ b/python/paddle/incubate/hapi/tests/test_model.py
@@ -23,16 +23,18 @@ import shutil
 import tempfile
 
 from paddle import fluid
-from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential
 from paddle.fluid.dygraph.base import to_variable
 
 import paddle.incubate.hapi as hapi
 from paddle.incubate.hapi import Model, Input
 from paddle.nn.layer.loss import CrossEntropyLoss
-from paddle.incubate.hapi.metrics import Accuracy
+from paddle.metric import Accuracy
 from paddle.incubate.hapi.datasets import MNIST
 from paddle.incubate.hapi.vision.models import LeNet
 from paddle.incubate.hapi.distributed import DistributedBatchSampler, prepare_distributed_context
+from paddle.fluid.dygraph.jit import declarative
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
 
 
 class LeNetDygraph(fluid.dygraph.Layer):
@@ -40,11 +42,11 @@ class LeNetDygraph(fluid.dygraph.Layer):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
-            Conv2D(
+            Conv2d(
                 1, 6, 3, stride=1, padding=1),
             ReLU(),
             Pool2D(2, 'max', 2),
-            Conv2D(
+            Conv2d(
                 6, 16, 5, stride=1, padding=0),
             ReLU(),
             Pool2D(2, 'max', 2))
@@ -65,6 +67,37 @@ class LeNetDygraph(fluid.dygraph.Layer):
         return x
 
 
+class LeNetDeclarative(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10, classifier_activation=None):
+        super(LeNetDeclarative, self).__init__()
+        self.num_classes = num_classes
+        self.features = Sequential(
+            Conv2d(
+                1, 6, 3, stride=1, padding=1),
+            ReLU(),
+            Pool2D(2, 'max', 2),
+            Conv2d(
+                6, 16, 5, stride=1, padding=0),
+            ReLU(),
+            Pool2D(2, 'max', 2))
+
+        if num_classes > 0:
+            self.fc = Sequential(
+                Linear(400, 120),
+                Linear(120, 84),
+                Linear(
+                    84, 10, act=classifier_activation))
+
+    @declarative
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.fc(x)
+        return x
+
+
 class MnistDataset(MNIST):
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
@@ -150,8 +183,8 @@ class TestModel(unittest.TestCase):
 
         cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader)
 
-        cls.inputs = [Input('image', [-1, 1, 28, 28], 'float32')]
-        cls.labels = [Input('label', [None, 1], 'int64')]
+        cls.inputs = [Input([-1, 1, 28, 28], 'float32', 'image')]
+        cls.labels = [Input([None, 1], 'int64', 'label')]
 
         cls.save_dir = tempfile.mkdtemp()
         cls.weight_path = os.path.join(cls.save_dir, 'lenet')
@@ -169,6 +202,12 @@ class TestModel(unittest.TestCase):
     def test_fit_static(self):
         self.fit(False)
 
+    def test_fit_dynamic_with_rank(self):
+        self.fit(True, 2, 0)
+
+    def test_fit_static_with_rank(self):
+        self.fit(False, 2, 0)
+
     def test_evaluate_dygraph(self):
         self.evaluate(True)
 
@@ -184,7 +223,7 @@ class TestModel(unittest.TestCase):
     def test_prepare_context(self):
         prepare_distributed_context()
 
-    def fit(self, dynamic):
+    def fit(self, dynamic, num_replicas=None, rank=None):
         fluid.enable_dygraph(self.device) if dynamic else None
         seed = 333
         fluid.default_startup_program().random_seed = seed
@@ -196,7 +235,7 @@ class TestModel(unittest.TestCase):
         model = Model(net, inputs=self.inputs, labels=self.labels)
         model.prepare(
             optim_new,
-            loss_function=CrossEntropyLoss(reduction="sum"),
+            loss=CrossEntropyLoss(reduction="sum"),
             metrics=Accuracy())
         model.fit(self.train_dataset, batch_size=64, shuffle=False)
 
@@ -204,9 +243,17 @@ class TestModel(unittest.TestCase):
         np.testing.assert_allclose(result['acc'], self.acc1)
 
         train_sampler = DistributedBatchSampler(
-            self.train_dataset, batch_size=64, shuffle=False)
+            self.train_dataset,
+            batch_size=64,
+            shuffle=False,
+            num_replicas=num_replicas,
+            rank=rank)
         val_sampler = DistributedBatchSampler(
-            self.val_dataset, batch_size=64, shuffle=False)
+            self.val_dataset,
+            batch_size=64,
+            shuffle=False,
+            num_replicas=num_replicas,
+            rank=rank)
 
         train_loader = fluid.io.DataLoader(
             self.train_dataset,
@@ -316,13 +363,11 @@ class TestModelFunction(unittest.TestCase):
             optim2 = fluid.optimizer.SGD(learning_rate=0.001,
                                          parameter_list=net.parameters())
 
-            inputs = [Input('x', [None, dim], 'float32')]
-            labels = [Input('label', [None, 1], 'int64')]
+            inputs = [Input([None, dim], 'float32', 'x')]
+            labels = [Input([None, 1], 'int64', 'label')]
             model = Model(net, inputs, labels)
-            model.prepare(
-                optim2, loss_function=CrossEntropyLoss(reduction="sum"))
+            model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum"))
             loss, = model.train_batch([data], [label])
-
             np.testing.assert_allclose(loss.flatten(), ref.flatten())
             fluid.disable_dygraph() if dynamic else None
 
@@ -345,7 +390,7 @@ class TestModelFunction(unittest.TestCase):
             fluid.enable_dygraph(device) if dynamic else None
             self.set_seed()
             net = MyModel()
-            inputs = [Input('x', [None, dim], 'float32')]
+            inputs = [Input([None, dim], 'float32', 'x')]
             model = Model(net, inputs)
             model.prepare()
             out, = model.test_batch([data])
@@ -359,14 +404,13 @@ class TestModelFunction(unittest.TestCase):
             device = hapi.set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
             net = MyModel(classifier_activation=None)
-            inputs = [Input('x', [None, 20], 'float32')]
-            labels = [Input('label', [None, 1], 'int64')]
+            inputs = [Input([None, 20], 'float32', 'x')]
+            labels = [Input([None, 1], 'int64', 'label')]
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=net.parameters())
             model = Model(net, inputs, labels)
             model.prepare(
-                optimizer=optim,
-                loss_function=CrossEntropyLoss(reduction="sum"))
+                optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
             model.save(path + '/test')
             model.load(path + '/test')
             shutil.rmtree(path)
@@ -380,18 +424,16 @@ class TestModelFunction(unittest.TestCase):
         model = Model(MyModel(classifier_activation=None))
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.save(path + '/test')
         fluid.disable_dygraph()
 
-        inputs = [Input('x', [None, 20], 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input([None, 20], 'float32', 'x')]
+        labels = [Input([None, 1], 'int64', 'label')]
         model = Model(MyModel(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.load(path + '/test')
         shutil.rmtree(path)
 
@@ -399,26 +441,24 @@ class TestModelFunction(unittest.TestCase):
         path = tempfile.mkdtemp()
 
         net = MyModel(classifier_activation=None)
-        inputs = [Input('x', [None, 20], 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input([None, 20], 'float32', 'x')]
+        labels = [Input([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=net.parameters())
         model = Model(net, inputs, labels)
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.save(path + '/test')
 
         device = hapi.set_device('cpu')
         fluid.enable_dygraph(device)  #if dynamic else None
 
         net = MyModel(classifier_activation=None)
-        inputs = [Input('x', [None, 20], 'float32')]
-        labels = [Input('label', [None, 1], 'int64')]
+        inputs = [Input([None, 20], 'float32', 'x')]
+        labels = [Input([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=net.parameters())
         model = Model(net, inputs, labels)
-        model.prepare(
-            optimizer=optim, loss_function=CrossEntropyLoss(reduction="sum"))
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.load(path + '/test')
         shutil.rmtree(path)
         fluid.disable_dygraph()
@@ -428,7 +468,7 @@ class TestModelFunction(unittest.TestCase):
             device = hapi.set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
             net = MyModel()
-            inputs = [Input('x', [None, 20], 'float32')]
+            inputs = [Input([None, 20], 'float32', 'x')]
             model = Model(net, inputs)
             model.prepare()
             params = model.parameters()
@@ -437,33 +477,48 @@ class TestModelFunction(unittest.TestCase):
             fluid.disable_dygraph() if dynamic else None
 
     def test_export_deploy_model(self):
-        net = LeNet()
-        inputs = [Input('image', [-1, 1, 28, 28], 'float32')]
-        model = Model(net, inputs)
-        model.prepare()
-        save_dir = tempfile.mkdtemp()
-        if not os.path.exists(save_dir):
-            os.makedirs(save_dir)
-
-        tensor_img = np.array(
-            np.random.random((1, 1, 28, 28)), dtype=np.float32)
-        ori_results = model.test_batch(tensor_img)
-
-        model.save_inference_model(save_dir)
-
-        place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        [inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=save_dir, executor=exe))
+        for dynamic in [True, False]:
+            fluid.enable_dygraph() if dynamic else None
+            # paddle.disable_static() if dynamic else None
+            prog_translator = ProgramTranslator()
+            prog_translator.enable(False) if not dynamic else None
+            net = LeNetDeclarative()
+            inputs = [Input([None, 1, 28, 28], 'float32', 'x')]
+            model = Model(net, inputs)
+            model.prepare()
+            save_dir = tempfile.mkdtemp()
+            if not os.path.exists(save_dir):
+                os.makedirs(save_dir)
+            tensor_img = np.array(
+                np.random.random((1, 1, 28, 28)), dtype=np.float32)
+            ori_results = model.test_batch(tensor_img)
+            model.save(save_dir, training=False)
+            fluid.disable_dygraph() if dynamic else None
 
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
+            place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0)
+            new_scope = fluid.Scope()
+            with fluid.scope_guard(new_scope):
+                exe = fluid.Executor(place)
+                [inference_program, feed_target_names, fetch_targets] = (
+                    fluid.io.load_inference_model(
+                        dirname=save_dir, executor=exe))
+                results = exe.run(inference_program,
+                                  feed={feed_target_names[0]: tensor_img},
+                                  fetch_list=fetch_targets)
+                np.testing.assert_allclose(
+                    results, ori_results, rtol=1e-5, atol=1e-7)
+                shutil.rmtree(save_dir)
+
+
+class TestRaiseError(unittest.TestCase):
+    def test_input_without_name(self):
+        net = MyModel(classifier_activation=None)
 
-        np.testing.assert_allclose(results, ori_results, rtol=1e-6)
-        shutil.rmtree(save_dir)
+        inputs = [Input([None, 10], 'float32')]
+        labels = [Input([None, 1], 'int64', 'label')]
+        with self.assertRaises(ValueError):
+            model = Model(net, inputs, labels)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/incubate/hapi/tests/test_pretrained_model.py b/python/paddle/incubate/hapi/tests/test_pretrained_model.py
index 588797322f4ab8e9eef9cc184cc6d82635de7d01..334ebff449d4f34c9a5a9b56ee7998b4dbc5abf0 100644
--- a/python/paddle/incubate/hapi/tests/test_pretrained_model.py
+++ b/python/paddle/incubate/hapi/tests/test_pretrained_model.py
@@ -28,7 +28,7 @@ class TestPretrainedModel(unittest.TestCase):
             fluid.enable_dygraph()
 
         net = models.__dict__[arch](pretrained=True, classifier_activation=None)
-        inputs = [Input('image', [None, 3, 224, 224], 'float32')]
+        inputs = [Input([None, 3, 224, 224], 'float32', 'image')]
         model = Model(network=net, inputs=inputs)
         model.prepare()
         res = model.test_batch(x)
diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/incubate/hapi/tests/test_text.py
index 78f089b06a38dec4eb189a9744e503f517f220db..bdc637997b0cbd8389fdfab9f71597c62b0e21a3 100644
--- a/python/paddle/incubate/hapi/tests/test_text.py
+++ b/python/paddle/incubate/hapi/tests/test_text.py
@@ -142,7 +142,7 @@ class TestBasicLSTM(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -168,7 +168,7 @@ class TestBasicGRU(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -219,8 +219,8 @@ class TestBeamSearch(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("init_hidden", [None, self.inputs[0].shape[-1]], "float32"),
-            Input("init_cell", [None, self.inputs[1].shape[-1]], "float32"),
+            Input([None, self.inputs[0].shape[-1]], "float32", "init_hidden"),
+            Input([None, self.inputs[1].shape[-1]], "float32", "init_cell"),
         ]
         return inputs
 
@@ -272,10 +272,10 @@ class TestTransformerEncoder(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("enc_input", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("attn_bias", [None, self.inputs[1].shape[1], None, None],
-                  "float32"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "enc_input"),
+            Input([None, self.inputs[1].shape[1], None, None], "float32",
+                  "attn_bias"),
         ]
         return inputs
 
@@ -336,14 +336,14 @@ class TestTransformerDecoder(TestTransformerEncoder):
 
     def make_inputs(self):
         inputs = [
-            Input("dec_input", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("enc_output", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("self_attn_bias",
-                  [None, self.inputs[-1].shape[1], None, None], "float32"),
-            Input("cross_attn_bias",
-                  [None, self.inputs[-1].shape[1], None, None], "float32"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "dec_input"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "enc_output"),
+            Input([None, self.inputs[-1].shape[1], None, None], "float32",
+                  "self_attn_bias"),
+            Input([None, self.inputs[-1].shape[1], None, None], "float32",
+                  "cross_attn_bias"),
         ]
         return inputs
 
@@ -431,10 +431,10 @@ class TestTransformerBeamSearchDecoder(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("enc_output", [None, None, self.inputs[0].shape[-1]],
-                  "float32"),
-            Input("trg_src_attn_bias",
-                  [None, self.inputs[1].shape[1], None, None], "float32"),
+            Input([None, None, self.inputs[0].shape[-1]], "float32",
+                  "enc_output"),
+            Input([None, self.inputs[1].shape[1], None, None], "float32",
+                  "trg_src_attn_bias"),
         ]
         return inputs
 
@@ -473,9 +473,9 @@ class TestSequenceTagging(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("word", [None, None], "int64"),
-            Input("lengths", [None], "int64"),
-            Input("target", [None, None], "int64"),
+            Input([None, None], "int64", "word"),
+            Input([None], "int64", "lengths"),
+            Input([None, None], "int64", "target"),
         ]
         return inputs
 
@@ -517,7 +517,7 @@ class TestStackedRNN(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -543,7 +543,7 @@ class TestLSTM(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -579,7 +579,7 @@ class TestBiLSTM(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -609,7 +609,7 @@ class TestGRU(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -645,7 +645,7 @@ class TestBiGRU(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"),
+            Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"),
         ]
         return inputs
 
@@ -680,7 +680,7 @@ class TestCNNEncoder(ModuleApiTest):
 
     def make_inputs(self):
         inputs = [
-            Input("input", [None, self.inputs[-1].shape[1], None], "float32"),
+            Input([None, self.inputs[-1].shape[1], None], "float32", "input"),
         ]
         return inputs
 
diff --git a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
index c2035a8b5c5958d54c79d6ee0ff6df654bb35d51..26ec53014b1c3b113a0e1ee82f3b9edfe9f48a3f 100644
--- a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
+++ b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py
@@ -22,7 +22,7 @@ import shutil
 import tempfile
 
 from paddle import fluid
-from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential
 
 from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict
 
@@ -32,11 +32,11 @@ class LeNetDygraph(fluid.dygraph.Layer):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
-            Conv2D(
+            Conv2d(
                 1, 6, 3, stride=1, padding=1),
             ReLU(),
             Pool2D(2, 'max', 2),
-            Conv2D(
+            Conv2d(
                 6, 16, 5, stride=1, padding=0),
             ReLU(),
             Pool2D(2, 'max', 2))
diff --git a/python/paddle/incubate/hapi/tests/test_vision_models.py b/python/paddle/incubate/hapi/tests/test_vision_models.py
index 16dbe431be801c9cd7ce48c4cd1444b7e0e558a4..2dc9355bcc3005d48b7046123b024fa2a91594c3 100644
--- a/python/paddle/incubate/hapi/tests/test_vision_models.py
+++ b/python/paddle/incubate/hapi/tests/test_vision_models.py
@@ -28,7 +28,7 @@ class TestVisonModels(unittest.TestCase):
         else:
             net = models.__dict__[arch](pretrained=pretrained)
 
-        input = hapi.Input('image', [None, 3, 224, 224], 'float32')
+        input = hapi.Input([None, 3, 224, 224], 'float32', 'image')
         model = hapi.Model(net, input)
         model.prepare()
 
@@ -71,7 +71,7 @@ class TestVisonModels(unittest.TestCase):
         self.models_infer('resnet152')
 
     def test_lenet(self):
-        input = hapi.Input('x', [None, 1, 28, 28], 'float32')
+        input = hapi.Input([None, 1, 28, 28], 'float32', 'x')
         lenet = hapi.Model(models.__dict__['LeNet'](), input)
         lenet.prepare()
 
diff --git a/python/paddle/incubate/hapi/text/text.py b/python/paddle/incubate/hapi/text/text.py
index 1424ce0381ac22e3fc15db854e653e0c2632cf22..a2940fbe6cf483bce905c596a4b50294129fab54 100644
--- a/python/paddle/incubate/hapi/text/text.py
+++ b/python/paddle/incubate/hapi/text/text.py
@@ -1804,7 +1804,7 @@ class DynamicDecode(Layer):
             from paddle.fluid.layers import BeamSearchDecoder
             from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode
 
-            paddle.enable_dygraph()
+            paddle.disable_static()
 
             vocab_size, d_model, = 100, 32
             encoder_output = paddle.rand((2, 4, d_model))
@@ -2278,7 +2278,7 @@ class TransformerCell(RNNCell):
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
             from paddle.incubate.hapi.text import DynamicDecode
 
-            paddle.enable_dygraph()
+            paddle.disable_static()
 
             class Embedder(fluid.dygraph.Layer):
                 def __init__(self):
@@ -2445,7 +2445,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
             from paddle.incubate.hapi.text import DynamicDecode
 
-            paddle.enable_dygraph()
+            paddle.disable_static()
 
             class Embedder(fluid.dygraph.Layer):
                 def __init__(self):
diff --git a/python/paddle/incubate/hapi/vision/models/lenet.py b/python/paddle/incubate/hapi/vision/models/lenet.py
index db1d894b4aa5f2535795c6350faad6ee3aee1164..dc7b094de0f26e04b9f07d011d3ce492950df269 100644
--- a/python/paddle/incubate/hapi/vision/models/lenet.py
+++ b/python/paddle/incubate/hapi/vision/models/lenet.py
@@ -13,7 +13,7 @@
 #limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
+from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential
 
 __all__ = ['LeNet']
 
@@ -39,11 +39,11 @@ class LeNet(fluid.dygraph.Layer):
         super(LeNet, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
-            Conv2D(
+            Conv2d(
                 1, 6, 3, stride=1, padding=1),
             ReLU(),
             Pool2D(2, 'max', 2),
-            Conv2D(
+            Conv2d(
                 6, 16, 5, stride=1, padding=0),
             ReLU(),
             Pool2D(2, 'max', 2))
diff --git a/python/paddle/incubate/hapi/vision/models/vgg.py b/python/paddle/incubate/hapi/vision/models/vgg.py
index 74e7228e5249fe990d037c9f12e75b6d4839c591..30f6e120b2502113045b3583686360f4ed2c32ac 100644
--- a/python/paddle/incubate/hapi/vision/models/vgg.py
+++ b/python/paddle/incubate/hapi/vision/models/vgg.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU
 from paddle.fluid.dygraph.container import Sequential
 
 from ...download import get_weights_path_from_url
@@ -105,12 +105,11 @@ def make_layers(cfg, batch_norm=False):
             layers += [Pool2D(pool_size=2, pool_stride=2)]
         else:
             if batch_norm:
-                conv2d = Conv2D(in_channels, v, filter_size=3, padding=1)
-                layers += [conv2d, BatchNorm(v, act='relu')]
+                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
+                layers += [conv2d, BatchNorm(v), ReLU()]
             else:
-                conv2d = Conv2D(
-                    in_channels, v, filter_size=3, padding=1, act='relu')
-                layers += [conv2d]
+                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
+                layers += [conv2d, ReLU()]
             in_channels = v
     return Sequential(*layers)
 
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index f9c083d2aeeeedb98b7d0ea2364dbb278ed26282..89bbd5916578b6e3169452d85e581c438f2bbb47 100644
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -15,9 +15,14 @@
 # TODO: define all functions about input & output in this directory 
 __all__ = [
     'Dataset',
+    'IterableDataset',
     'BatchSampler',
     #            'Transform',
     'DataLoader',
+    'get_worker_info',
+    'Sampler',
+    'SequenceSampler',
+    'RandomSampler',
     'load',
     'save',
     'load_program_state',
@@ -36,7 +41,8 @@ __all__ = [
 ]
 
 from ..fluid.io import DataLoader
-from ..fluid.dataloader import Dataset, BatchSampler
+from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
+        Sampler, SequenceSampler, RandomSampler
 from ..fluid.io import load, save, load_program_state, set_program_state, \
         load_inference_model, save_inference_model, batch
 from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers
diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..47369e3ff9cd87539f9e96708ff981dc67d06420
--- /dev/null
+++ b/python/paddle/jit/__init__.py
@@ -0,0 +1,26 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..fluid.dygraph.jit import save  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import load  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import SaveLoadConfig  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import TracedLayer  #DEFINE_ALIAS
+from ..fluid.dygraph.jit import declarative as to_static  #DEFINE_ALIAS
+from ..fluid.dygraph import ProgramTranslator  #DEFINE_ALIAS
+from ..fluid.dygraph.io import TranslatedLayer  #DEFINE_ALIAS
+
+__all__ = [
+    'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static',
+    'ProgramTranslator', 'TranslatedLayer'
+]
diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py
index e03336f6dbab7b375701e1e694aee0bbbfa4b1cd..6e197881fc0bcbc32f9d9d738237082138f9410b 100644
--- a/python/paddle/metric/__init__.py
+++ b/python/paddle/metric/__init__.py
@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the functions to calculate metric in this directory 
-__all__ = [
-    'Accuracy', 'Auc', 'ChunkEvaluator', 'CompositeMetric', 'DetectionMAP',
-    'EditDistance', 'Precision', 'Recall', 'accuracy', 'auc', 'chunk_eval',
-    'cos_sim', 'mean_iou'
-]
-
-
-
-from ..fluid.metrics import Accuracy, Auc, ChunkEvaluator, CompositeMetric, DetectionMAP, EditDistance, \
-        Precision, Recall
+from .metrics import *
+from . import metrics
 
 from ..fluid.layers.metric_op import accuracy, auc
 from ..fluid.layers.nn import chunk_eval, cos_sim, mean_iou
+
+__all__ = metrics.__all__ + [
+    'accuracy',
+    'auc',
+    'chunk_eval',
+    'cos_sim',
+    'mean_iou',
+]
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..110a62c300559b9037cd2ca735aebd1946ba0ce9
--- /dev/null
+++ b/python/paddle/metric/metrics.py
@@ -0,0 +1,738 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import abc
+import numpy as np
+
+import paddle
+
+__all__ = ['Metric', 'Accuracy', 'Precision', 'Recall', 'Auc']
+
+
+def _is_numpy_(var):
+    return isinstance(var, (np.ndarray, np.generic))
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Metric(object):
+    """
+    Base class for metric, encapsulates metric logic and APIs
+    Usage:
+        
+        m = SomeMetric()
+        for prediction, label in ...:
+            m.update(prediction, label)
+        m.accumulate()
+        
+    Advanced usage for :code:`compute`:
+
+    Metric calculation can be accelerated by calculating metric states
+    from model outputs and labels by build-in operators not by Python/NumPy
+    in :code:`compute`, metric states will be fetched as NumPy array and
+    call :code:`update` with states in NumPy format.
+    Metric calculated as follows (operations in Model and Metric are
+    indicated with curly brackets, while data nodes not):
+                 inputs & labels              || ------------------
+                       |                      ||
+                    {model}                   ||
+                       |                      ||
+                outputs & labels              ||
+                       |                      ||    tensor data
+                {Metric.compute}              ||
+                       |                      ||
+              metric states(tensor)           ||
+                       |                      ||
+                {fetch as numpy}              || ------------------
+                       |                      ||
+              metric states(numpy)            ||    numpy data
+                       |                      ||
+                {Metric.update}               \/ ------------------
+    Examples:
+        
+        For :code:`Accuracy` metric, which takes :code:`pred` and :code:`label`
+        as inputs, we can calculate the correct prediction matrix between
+        :code:`pred` and :code:`label` in :code:`compute`.
+        For examples, prediction results contains 10 classes, while :code:`pred`
+        shape is [N, 10], :code:`label` shape is [N, 1], N is mini-batch size,
+        and we only need to calculate accurary of top-1 and top-5, we could
+        calculate the correct prediction matrix of the top-5 scores of the
+        prediction of each sample like follows, while the correct prediction
+        matrix shape is [N, 5].
+
+        .. code-block:: python
+            def compute(pred, label):
+                # sort prediction and slice the top-5 scores
+                pred = paddle.argsort(pred, descending=True)[:, :5]
+                # calculate whether the predictions are correct
+                correct = pred == label
+                return paddle.cast(correct, dtype='float32')
+
+        With the :code:`compute`, we split some calculations to OPs (which
+        may run on GPU devices, will be faster), and only fetch 1 tensor with
+        shape as [N, 5] instead of 2 tensors with shapes as [N, 10] and [N, 1].
+        :code:`update` can be define as follows:
+
+        .. code-block:: python
+            def update(self, correct):
+                accs = []
+                for i, k in enumerate(self.topk):
+                    num_corrects = correct[:, :k].sum()
+                    num_samples = len(correct)
+                    accs.append(float(num_corrects) / num_samples)
+                    self.total[i] += num_corrects
+                    self.count[i] += num_samples
+                return accs
+    """
+
+    def __init__(self):
+        pass
+
+    @abc.abstractmethod
+    def reset(self):
+        """
+        Reset states and result
+        """
+        raise NotImplementedError("function 'reset' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    @abc.abstractmethod
+    def update(self, *args):
+        """
+        Update states for metric
+
+        Inputs of :code:`update` is the outputs of :code:`Metric.compute`,
+        if :code:`compute` is not defined, the inputs of :code:`update`
+        will be flatten arguments of **output** of mode and **label** from data:
+        :code:`update(output1, output2, ..., label1, label2,...)`
+
+        see :code:`Metric.compute`
+        """
+        raise NotImplementedError("function 'update' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    @abc.abstractmethod
+    def accumulate(self):
+        """
+        Accumulates statistics, computes and returns the metric value
+        """
+        raise NotImplementedError(
+            "function 'accumulate' not implemented in {}.".format(
+                self.__class__.__name__))
+
+    @abc.abstractmethod
+    def name(self):
+        """
+        Returns metric name
+        """
+        raise NotImplementedError("function 'name' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    def compute(self, *args):
+        """
+        This API is advanced usage to accelerate metric calculating, calulations
+        from outputs of model to the states which should be updated by Metric can
+        be defined here, where Paddle OPs is also supported. Outputs of this API
+        will be the inputs of "Metric.update".
+
+        If :code:`compute` is defined, it will be called with **outputs**
+        of model and **labels** from data as arguments, all outputs and labels
+        will be concatenated and flatten and each filed as a separate argument
+        as follows:
+        :code:`compute(output1, output2, ..., label1, label2,...)`
+
+        If :code:`compute` is not defined, default behaviour is to pass
+        input to output, so output format will be:
+        :code:`return output1, output2, ..., label1, label2,...`
+
+        see :code:`Metric.update`
+        """
+        return args
+
+
+class Accuracy(Metric):
+    """
+    Encapsulates accuracy metric logic.
+
+    Args:
+        topk (int|tuple(int)): Number of top elements to look at
+            for computing accuracy. Default is (1,).
+        name (str, optional): String name of the metric instance. Default
+            is `acc`.
+
+    Example by standalone:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        paddle.disable_static()
+        x = paddle.to_tensor(np.array([
+            [0.1, 0.2, 0.3, 0.4],
+            [0.1, 0.4, 0.3, 0.2],
+            [0.1, 0.2, 0.4, 0.3],
+            [0.1, 0.2, 0.3, 0.4]]))
+        y = paddle.to_tensor(np.array([[0], [1], [2], [3]]))
+
+        m = paddle.metric.Accuracy()
+        correct = m.compute(x, y)
+        m.update(correct)
+        res = m.accumulate()
+        print(res) # 0.75
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import paddle
+        import paddle.incubate.hapi as hapi
+
+        paddle.disable_static()
+        train_dataset = hapi.datasets.MNIST(mode='train')
+
+        model = hapi.Model(hapi.vision.LeNet(classifier_activation=None))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        model.prepare(
+            optim,
+            loss=paddle.nn.CrossEntropyLoss(),
+            metrics=paddle.metric.Accuracy())
+
+        model.fit(train_dataset, batch_size=64)
+
+    """
+
+    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
+        super(Accuracy, self).__init__(*args, **kwargs)
+        self.topk = topk
+        self.maxk = max(topk)
+        self._init_name(name)
+        self.reset()
+
+    def compute(self, pred, label, *args):
+        """
+        Compute the top-k (maxinum value in `topk`) indices.
+
+        Args:
+            pred (Tensor): The predicted value is a Tensor wit type
+                float32 or float64.
+            label (Tensor): The ground truth value is a 2D Tensor, its
+                shape is [batch_size, 1] and type is int64.
+
+        Return:
+            Tensor: Correct mask, a tensor with shape [batch_size, topk].
+        """
+        pred = paddle.argsort(pred, descending=True)[:, :self.maxk]
+        correct = pred == label
+        return paddle.cast(correct, dtype='float32')
+
+    def update(self, correct, *args):
+        """
+        Update the metrics states (correct count and total count), in order to
+        calculate cumulative accuracy of all instances. This function also
+        returns the accuracy of current step.
+        
+        Args:
+            correct: Correct mask, a tensor with shape [batch_size, topk].
+
+        Return:
+            Tensor: the accuracy of current step.
+        """
+        if isinstance(correct, paddle.Tensor):
+            correct = correct.numpy()
+        accs = []
+        for i, k in enumerate(self.topk):
+            num_corrects = correct[:, :k].sum()
+            num_samples = len(correct)
+            accs.append(float(num_corrects) / num_samples)
+            self.total[i] += num_corrects
+            self.count[i] += num_samples
+        accs = accs[0] if len(self.topk) == 1 else accs
+        return accs
+
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.total = [0.] * len(self.topk)
+        self.count = [0] * len(self.topk)
+
+    def accumulate(self):
+        """
+        Computes and returns the accumulated metric.
+        """
+        res = []
+        for t, c in zip(self.total, self.count):
+            r = float(t) / c if c > 0 else 0.
+            res.append(r)
+        res = res[0] if len(self.topk) == 1 else res
+        return res
+
+    def _init_name(self, name):
+        name = name or 'acc'
+        if self.maxk != 1:
+            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
+        else:
+            self._name = [name]
+
+    def name(self):
+        """
+        Return name of metric instance.
+        """
+        return self._name
+
+
+class Precision(Metric):
+    """
+    Precision (also called positive predictive value) is the fraction of
+    relevant instances among the retrieved instances. Refer to
+    https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers
+
+    Noted that this class manages the precision score only for binary
+    classification task.
+
+    Args:
+        name (str, optional): String name of the metric instance.
+            Default is `precision`.
+
+    Example by standalone:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([0, 1, 1, 1])
+
+        m = paddle.metric.Precision()
+        m.update(x, y)
+        res = m.accumulate()
+        print(res) # 1.0
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import numpy as np
+        
+        import paddle
+        import paddle.nn as nn
+        import paddle.incubate.hapi as hapi
+        
+        class Data(paddle.io.Dataset):
+            def __init__(self):
+                super(Data, self).__init__()
+                self.n = 1024
+                self.x = np.random.randn(self.n, 10).astype('float32')
+                self.y = np.random.randint(2, size=(self.n, 1)).astype('float32')
+        
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+        
+            def __len__(self):
+                return self.n
+  
+        paddle.disable_static()
+        model = hapi.Model(nn.Sequential(
+            nn.Linear(10, 1),
+            nn.Sigmoid()
+        ))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        model.prepare(
+            optim,
+            loss=nn.BCELoss(),
+            metrics=paddle.metric.Precision())
+        
+        data = Data()
+        model.fit(data, batch_size=16)
+    """
+
+    def __init__(self, name='precision', *args, **kwargs):
+        super(Precision, self).__init__(*args, **kwargs)
+        self.tp = 0  # true positive
+        self.fp = 0  # false positive
+        self._name = name
+
+    def update(self, preds, labels):
+        """
+        Update the states based on the current mini-batch prediction results.
+
+        Args:
+            preds (numpy.ndarray): The prediction result, usually the output
+                of two-class sigmoid function. It should be a vector (column
+                vector or row vector) with data type: 'float64' or 'float32'.
+            labels (numpy.ndarray): The ground truth (labels),
+                the shape should keep the same as preds.
+                The data type is 'int32' or 'int64'.
+        """
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
+        elif not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
+
+        if isinstance(labels, paddle.Tensor):
+            labels = labels.numpy()
+        elif not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
+
+        sample_num = labels.shape[0]
+        preds = np.floor(preds + 0.5).astype("int32")
+
+        for i in range(sample_num):
+            pred = preds[i]
+            label = labels[i]
+            if pred == 1:
+                if pred == label:
+                    self.tp += 1
+                else:
+                    self.fp += 1
+
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.tp = 0
+        self.fp = 0
+
+    def accumulate(self):
+        """
+        Calculate the final precision.
+
+        Returns:
+            A scaler float: results of the calculated precision.
+        """
+        ap = self.tp + self.fp
+        return float(self.tp) / ap if ap != 0 else .0
+
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
+
+
+class Recall(Metric):
+    """
+    Recall (also known as sensitivity) is the fraction of
+    relevant instances that have been retrieved over the
+    total amount of relevant instances
+
+    Refer to:
+    https://en.wikipedia.org/wiki/Precision_and_recall
+
+    Noted that this class manages the recall score only for
+    binary classification task.
+
+    Args:
+        name (str, optional): String name of the metric instance.
+            Default is `recall`.
+
+    Example by standalone:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([1, 0, 1, 1])
+
+        m = paddle.metric.Recall()
+        m.update(x, y)
+        res = m.accumulate()
+        print(res) # 2.0 / 3.0
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import numpy as np
+        
+        import paddle
+        import paddle.nn as nn
+        import paddle.incubate.hapi as hapi
+        
+        class Data(paddle.io.Dataset):
+            def __init__(self):
+                super(Data, self).__init__()
+                self.n = 1024
+                self.x = np.random.randn(self.n, 10).astype('float32')
+                self.y = np.random.randint(2, size=(self.n, 1)).astype('float32')
+        
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+        
+            def __len__(self):
+                return self.n
+        
+        paddle.disable_static()
+        model = hapi.Model(nn.Sequential(
+            nn.Linear(10, 1),
+            nn.Sigmoid()
+        ))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        model.prepare(
+            optim,
+            loss=nn.BCELoss(),
+            metrics=[paddle.metric.Precision(), paddle.metric.Recall()])
+        
+        data = Data()
+        model.fit(data, batch_size=16)
+    """
+
+    def __init__(self, name='recall', *args, **kwargs):
+        super(Recall, self).__init__(*args, **kwargs)
+        self.tp = 0  # true positive
+        self.fn = 0  # false negative
+        self._name = name
+
+    def update(self, preds, labels):
+        """
+        Update the states based on the current mini-batch prediction results.
+
+        Args:
+            preds(numpy.array): prediction results of current mini-batch,
+                the output of two-class sigmoid function.
+                Shape: [batch_size, 1]. Dtype: 'float64' or 'float32'.
+            labels(numpy.array): ground truth (labels) of current mini-batch,
+                the shape should keep the same as preds.
+                Shape: [batch_size, 1], Dtype: 'int32' or 'int64'.
+        """
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
+        elif not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
+
+        if isinstance(labels, paddle.Tensor):
+            labels = labels.numpy()
+        elif not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
+
+        sample_num = labels.shape[0]
+        preds = np.rint(preds).astype("int32")
+
+        for i in range(sample_num):
+            pred = preds[i]
+            label = labels[i]
+            if label == 1:
+                if pred == label:
+                    self.tp += 1
+                else:
+                    self.fn += 1
+
+    def accumulate(self):
+        """
+        Calculate the final recall.
+
+        Returns:
+            A scaler float: results of the calculated Recall.
+        """
+        recall = self.tp + self.fn
+        return float(self.tp) / recall if recall != 0 else .0
+
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.tp = 0
+        self.fn = 0
+
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
+
+
+class Auc(Metric):
+    """
+    The auc metric is for binary classification.
+    Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve.
+    Please notice that the auc metric is implemented with python, which may be a little bit slow.
+
+    The `auc` function creates four local variables, `true_positives`,
+    `true_negatives`, `false_positives` and `false_negatives` that are used to
+    compute the AUC. To discretize the AUC curve, a linearly spaced set of
+    thresholds is used to compute pairs of recall and precision values. The area
+    under the ROC-curve is therefore computed using the height of the recall
+    values by the false positive rate, while the area under the PR-curve is the
+    computed using the height of the precision values by the recall.
+
+    Args:
+        curve (str): Specifies the mode of the curve to be computed,
+            'ROC' or 'PR' for the Precision-Recall-curve. Default is 'ROC'.
+        num_thresholds (int): The number of thresholds to use when
+            discretizing the roc curve. Default is 4095.
+            'ROC' or 'PR' for the Precision-Recall-curve. Default is 'ROC'.
+        name (str, optional): String name of the metric instance. Default
+            is `auc`.
+
+    "NOTE: only implement the ROC curve type via Python now."
+
+    Example by standalone:
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        m = paddle.metric.Auc()
+        
+        n = 8
+        class0_preds = np.random.random(size = (n, 1))
+        class1_preds = 1 - class0_preds
+        
+        preds = np.concatenate((class0_preds, class1_preds), axis=1)
+        labels = np.random.randint(2, size = (n, 1))
+        
+        m.update(preds=preds, labels=labels)
+        res = m.accumulate()
+
+
+    Example with Model API:
+        
+        .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn as nn
+        import paddle.incubate.hapi as hapi
+        
+        class Data(paddle.io.Dataset):
+            def __init__(self):
+                super(Data, self).__init__()
+                self.n = 1024
+                self.x = np.random.randn(self.n, 10).astype('float32')
+                self.y = np.random.randint(2, size=(self.n, 1)).astype('int64')
+        
+            def __getitem__(self, idx):
+                return self.x[idx], self.y[idx]
+        
+            def __len__(self):
+                return self.n
+        
+        paddle.disable_static()
+        model = hapi.Model(nn.Sequential(
+            nn.Linear(10, 2, act='softmax'),
+        ))
+        optim = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=model.parameters())
+        
+        def loss(x, y):
+            return nn.functional.nll_loss(paddle.log(x), y)
+        
+        model.prepare(
+            optim,
+            loss=loss,
+            metrics=paddle.metric.Auc())
+        data = Data()
+        model.fit(data, batch_size=16)
+    """
+
+    def __init__(self,
+                 curve='ROC',
+                 num_thresholds=4095,
+                 name='auc',
+                 *args,
+                 **kwargs):
+        super(Auc, self).__init__(*args, **kwargs)
+        self._curve = curve
+        self._num_thresholds = num_thresholds
+
+        _num_pred_buckets = num_thresholds + 1
+        self._stat_pos = np.zeros(_num_pred_buckets)
+        self._stat_neg = np.zeros(_num_pred_buckets)
+        self._name = name
+
+    def update(self, preds, labels):
+        """
+        Update the auc curve with the given predictions and labels.
+
+        Args:
+            preds (numpy.array): An numpy array in the shape of
+                (batch_size, 2), preds[i][j] denotes the probability of
+                classifying the instance i into the class j.
+            labels (numpy.array): an numpy array in the shape of
+                (batch_size, 1), labels[i] is either o or 1,
+                representing the label of the instance i.
+        """
+        if isinstance(labels, paddle.Tensor):
+            labels = labels.numpy()
+        elif not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
+
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
+        elif not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
+
+        for i, lbl in enumerate(labels):
+            value = preds[i, 1]
+            bin_idx = int(value * self._num_thresholds)
+            assert bin_idx <= self._num_thresholds
+            if lbl:
+                self._stat_pos[bin_idx] += 1.0
+            else:
+                self._stat_neg[bin_idx] += 1.0
+
+    @staticmethod
+    def trapezoid_area(x1, x2, y1, y2):
+        return abs(x1 - x2) * (y1 + y2) / 2.0
+
+    def accumulate(self):
+        """
+        Return the area (a float score) under auc curve
+
+        Return:
+            float: the area under auc curve
+        """
+        tot_pos = 0.0
+        tot_neg = 0.0
+        auc = 0.0
+
+        idx = self._num_thresholds
+        while idx >= 0:
+            tot_pos_prev = tot_pos
+            tot_neg_prev = tot_neg
+            tot_pos += self._stat_pos[idx]
+            tot_neg += self._stat_neg[idx]
+            auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
+                                       tot_pos_prev)
+            idx -= 1
+
+        return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
+
+    def reset(self):
+        """
+        Reset states and result
+        """
+        _num_pred_buckets = self._num_thresholds + 1
+        self._stat_pos = np.zeros(_num_pred_buckets)
+        self._stat_neg = np.zeros(_num_pred_buckets)
+
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index e074ca66bb1d3700cc2e50db2b1439e991113f39..07b3f0d284dcd28d4967131ab85bb2ca3cd1d6da 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -18,6 +18,7 @@
 from .layer import norm
 from .functional import extension
 from .layer import common
+from .utils import weight_norm_hook
 
 from . import initializer
 
@@ -25,6 +26,7 @@ __all__ = []
 __all__ += norm.__all__
 __all__ += extension.__all__
 __all__ += common.__all__
+__all__ += weight_norm_hook.__all__
 
 # TODO: define alias in nn directory
 # from .clip import ErrorClipByValue        #DEFINE_ALIAS
@@ -49,25 +51,56 @@ from .decode import beam_search_decode  #DEFINE_ALIAS
 # from .decode import ctc_greedy_decoder        #DEFINE_ALIAS
 # from .decode import dynamic_decode        #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
-from .input import data  #DEFINE_ALIAS
 # from .input import Input        #DEFINE_ALIAS
-# from .layer.activation import PReLU        #DEFINE_ALIAS
-from .layer.activation import ReLU  #DEFINE_ALIAS
+from .layer.activation import ELU
+from .layer.activation import GELU
+from .layer.activation import Tanh
+from .layer.activation import Hardshrink
+from .layer.activation import Hardtanh
+from .layer.activation import PReLU
+from .layer.activation import ReLU
+from .layer.activation import ReLU6  #DEFINE_ALIAS
+from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS
 from .layer.activation import Sigmoid  #DEFINE_ALIAS
-# from .layer.activation import Softmax        #DEFINE_ALIAS
+from .layer.activation import LogSigmoid
+from .layer.activation import Softmax  #DEFINE_ALIAS
+from .layer.activation import Softplus  #DEFINE_ALIAS
+from .layer.activation import Softshrink  #DEFINE_ALIAS
+from .layer.activation import Softsign  #DEFINE_ALIAS
+from .layer.activation import Tanhshrink  #DEFINE_ALIAS
 from .layer.activation import LogSoftmax  #DEFINE_ALIAS
 from .layer.activation import HSigmoid  #DEFINE_ALIAS
 from .layer.common import BilinearTensorProduct  #DEFINE_ALIAS
 from .layer.common import Pool2D  #DEFINE_ALIAS
 from .layer.common import Pad2D  #DEFINE_ALIAS
+from .layer.common import ReflectionPad1d  #DEFINE_ALIAS
+from .layer.common import ReplicationPad1d  #DEFINE_ALIAS
+from .layer.common import ConstantPad1d  #DEFINE_ALIAS
+from .layer.common import ReflectionPad2d  #DEFINE_ALIAS
+from .layer.common import ReplicationPad2d  #DEFINE_ALIAS
+from .layer.common import ConstantPad2d  #DEFINE_ALIAS
+from .layer.common import ZeroPad2d  #DEFINE_ALIAS
+from .layer.common import ReplicationPad3d  #DEFINE_ALIAS
+from .layer.common import ConstantPad3d  #DEFINE_ALIAS
+from .layer.common import CosineSimilarity  #DEFINE_ALIAS
 from .layer.common import Embedding  #DEFINE_ALIAS
 from .layer.common import Linear  #DEFINE_ALIAS
+from .layer.common import Flatten  #DEFINE_ALIAS
 from .layer.common import UpSample  #DEFINE_ALIAS
-from .layer.conv import Conv2D  #DEFINE_ALIAS
-from .layer.conv import Conv2DTranspose  #DEFINE_ALIAS
-from .layer.conv import Conv3D  #DEFINE_ALIAS
-from .layer.conv import Conv3DTranspose  #DEFINE_ALIAS
+from .layer.common import Bilinear  #DEFINE_ALIAS
+from .layer.common import Dropout  #DEFINE_ALIAS
+from .layer.common import Dropout2D  #DEFINE_ALIAS
+from .layer.common import Dropout3D  #DEFINE_ALIAS
+from .layer.common import AlphaDropout  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
+from .layer.pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
+from .layer.conv import Conv1d  #DEFINE_ALIAS
+from .layer.conv import Conv2d  #DEFINE_ALIAS
+from .layer.conv import Conv3d  #DEFINE_ALIAS
+from .layer.conv import ConvTranspose1d  #DEFINE_ALIAS
+from .layer.conv import ConvTranspose2d  #DEFINE_ALIAS
+from .layer.conv import ConvTranspose3d  #DEFINE_ALIAS
 # from .layer.conv import TreeConv        #DEFINE_ALIAS
 # from .layer.conv import Conv1D        #DEFINE_ALIAS
 from .layer.extension import RowConv  #DEFINE_ALIAS
@@ -79,21 +112,43 @@ from .layer.extension import RowConv  #DEFINE_ALIAS
 # from .layer.learning_rate import PiecewiseDecay        #DEFINE_ALIAS
 # from .layer.learning_rate import PolynomialDecay        #DEFINE_ALIAS
 # from .layer.loss import NCELoss        #DEFINE_ALIAS
+from .layer.loss import BCEWithLogitsLoss  #DEFINE_ALIAS
 from .layer.loss import CrossEntropyLoss  #DEFINE_ALIAS
 from .layer.loss import MSELoss  #DEFINE_ALIAS
 from .layer.loss import L1Loss  #DEFINE_ALIAS
 from .layer.loss import NLLLoss  #DEFINE_ALIAS
 from .layer.loss import BCELoss  #DEFINE_ALIAS
+from .layer.loss import KLDivLoss  #DEFINE_ALIAS
+from .layer.loss import MarginRankingLoss  #DEFINE_ALIAS
+from .layer.loss import CTCLoss  #DEFINE_ALIAS
+from .layer.loss import SmoothL1Loss  #DEFINE_ALIAS
 from .layer.norm import BatchNorm  #DEFINE_ALIAS
+from .layer.norm import SyncBatchNorm  #DEFINE_ALIAS
 from .layer.norm import GroupNorm  #DEFINE_ALIAS
 from .layer.norm import LayerNorm  #DEFINE_ALIAS
 from .layer.norm import SpectralNorm  #DEFINE_ALIAS
 from .layer.norm import InstanceNorm  #DEFINE_ALIAS
+from .layer.norm import InstanceNorm1d  #DEFINE_ALIAS
+from .layer.norm import InstanceNorm2d  #DEFINE_ALIAS
+from .layer.norm import InstanceNorm3d  #DEFINE_ALIAS
+from .layer.norm import BatchNorm1d  #DEFINE_ALIAS
+from .layer.norm import BatchNorm2d  #DEFINE_ALIAS
+from .layer.norm import BatchNorm3d  #DEFINE_ALIAS
 # from .layer.rnn import RNNCell        #DEFINE_ALIAS
 # from .layer.rnn import GRUCell        #DEFINE_ALIAS
 # from .layer.rnn import LSTMCell        #DEFINE_ALIAS
+from .layer.transformer import MultiHeadAttention
+from .layer.transformer import TransformerEncoderLayer
+from .layer.transformer import TransformerEncoder
+from .layer.transformer import TransformerDecoderLayer
+from .layer.transformer import TransformerDecoder
+from .layer.transformer import Transformer
+from .layer.distance import PairwiseDistance  #DEFINE_ALIAS
+
+from .layer.vision import PixelShuffle
 
 from .layer import loss  #DEFINE_ALIAS
 from .layer import conv  #DEFINE_ALIAS
+from .layer import vision  #DEFINE_ALIAS
 from ..fluid.dygraph.layers import Layer  #DEFINE_ALIAS
 from ..fluid.dygraph.container import LayerList, ParameterList, Sequential  #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 3fefb1b053ee88b01b3a3ca71918ffafdeb71ff2..97a4d5432bdc24912a851741328516e9269a64c2 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -25,18 +25,23 @@ from . import extension
 __all__ += extension.__all__
 from . import common
 __all__ += common.__all__
+from . import pooling
+__all__ += pooling.__all__
+from . import loss
+__all__ += loss.__all__
 from .activation import brelu  #DEFINE_ALIAS
 from .activation import elu  #DEFINE_ALIAS
 from .activation import erf  #DEFINE_ALIAS
 from .activation import gelu  #DEFINE_ALIAS
-from .activation import hard_shrink  #DEFINE_ALIAS
+from .activation import hardshrink  #DEFINE_ALIAS
+from .activation import hardtanh  #DEFINE_ALIAS
 from .activation import hard_sigmoid  #DEFINE_ALIAS
 from .activation import hard_swish  #DEFINE_ALIAS
 from .activation import hsigmoid  #DEFINE_ALIAS
 from .activation import leaky_relu  #DEFINE_ALIAS
 from .activation import logsigmoid  #DEFINE_ALIAS
 from .activation import maxout  #DEFINE_ALIAS
-# from .activation import prelu        #DEFINE_ALIAS
+from .activation import prelu  #DEFINE_ALIAS
 from .activation import relu  #DEFINE_ALIAS
 from .activation import relu6  #DEFINE_ALIAS
 from .activation import selu  #DEFINE_ALIAS
@@ -47,10 +52,14 @@ from .activation import softplus  #DEFINE_ALIAS
 from .activation import softshrink  #DEFINE_ALIAS
 from .activation import softsign  #DEFINE_ALIAS
 from .activation import swish  #DEFINE_ALIAS
-from .activation import tanh_shrink  #DEFINE_ALIAS
+from .activation import tanh  #DEFINE_ALIAS
+from .activation import tanhshrink  #DEFINE_ALIAS
 from .activation import thresholded_relu  #DEFINE_ALIAS
 from .activation import log_softmax  #DEFINE_ALIAS
 from .common import dropout  #DEFINE_ALIAS
+from .common import dropout2d  #DEFINE_ALIAS
+from .common import dropout3d  #DEFINE_ALIAS
+from .common import alpha_dropout  #DEFINE_ALIAS
 # from .common import embedding        #DEFINE_ALIAS
 # from .common import fc  #DEFINE_ALIAS
 from .common import label_smooth  #DEFINE_ALIAS
@@ -58,14 +67,18 @@ from .common import one_hot  #DEFINE_ALIAS
 from .common import pad  #DEFINE_ALIAS
 from .common import pad_constant_like  #DEFINE_ALIAS
 from .common import pad2d  #DEFINE_ALIAS
+from .common import cosine_similarity  #DEFINE_ALIAS
 from .common import unfold  #DEFINE_ALIAS
 # from .common import bilinear_tensor_product        #DEFINE_ALIAS
 from .common import assign  #DEFINE_ALIAS
 from .common import interpolate  #DEFINE_ALIAS
+from .common import bilinear  #DEFINE_ALIAS
+from .conv import conv1d  #DEFINE_ALIAS
+from .conv import conv_transpose1d  #DEFINE_ALIAS
 from .conv import conv2d  #DEFINE_ALIAS
-from .conv import conv2d_transpose  #DEFINE_ALIAS
+from .conv import conv_transpose2d  #DEFINE_ALIAS
 from .conv import conv3d  #DEFINE_ALIAS
-from .conv import conv3d_transpose  #DEFINE_ALIAS
+from .conv import conv_transpose3d  #DEFINE_ALIAS
 from .extension import add_position_encoding  #DEFINE_ALIAS
 # from .extension import autoincreased_step_counter        #DEFINE_ALIAS
 from .extension import continuous_value_model  #DEFINE_ALIAS
@@ -119,6 +132,8 @@ from .lod import hash  #DEFINE_ALIAS
 # from .lod import dynamic_gru        #DEFINE_ALIAS
 # from .lod import dynamic_lstm        #DEFINE_ALIAS
 # from .lod import dynamic_lstmp        #DEFINE_ALIAS
+from .loss import binary_cross_entropy  #DEFINE_ALIAS
+from .loss import binary_cross_entropy_with_logits  #DEFINE_ALIAS
 from .loss import bpr_loss  #DEFINE_ALIAS
 from .loss import center_loss  #DEFINE_ALIAS
 from .loss import cross_entropy  #DEFINE_ALIAS
@@ -126,10 +141,12 @@ from .loss import dice_loss  #DEFINE_ALIAS
 from .loss import edit_distance  #DEFINE_ALIAS
 from .loss import huber_loss  #DEFINE_ALIAS
 from .loss import iou_similarity  #DEFINE_ALIAS
-from .loss import kldiv_loss  #DEFINE_ALIAS
+from .loss import kl_div  #DEFINE_ALIAS
+from .loss import l1_loss  #DEFINE_ALIAS
 from .loss import log_loss  #DEFINE_ALIAS
-from .loss import margin_rank_loss  #DEFINE_ALIAS
+from .loss import margin_ranking_loss  #DEFINE_ALIAS
 from .loss import mse_loss  #DEFINE_ALIAS
+from .loss import nll_loss  #DEFINE_ALIAS
 # from .loss import nce        #DEFINE_ALIAS
 from .loss import npair_loss  #DEFINE_ALIAS
 from .loss import rank_loss  #DEFINE_ALIAS
@@ -137,22 +154,35 @@ from .loss import sampled_softmax_with_cross_entropy  #DEFINE_ALIAS
 from .loss import sigmoid_cross_entropy_with_logits  #DEFINE_ALIAS
 from .loss import sigmoid_focal_loss  #DEFINE_ALIAS
 from .loss import smooth_l1  #DEFINE_ALIAS
+from .loss import smooth_l1_loss  #DEFINE_ALIAS
 from .loss import softmax_with_cross_entropy  #DEFINE_ALIAS
 from .loss import square_error_cost  #DEFINE_ALIAS
 from .loss import ssd_loss  #DEFINE_ALIAS
 from .loss import teacher_student_sigmoid_loss  #DEFINE_ALIAS
-# from .norm import batch_norm        #DEFINE_ALIAS
+from .loss import ctc_loss  #DEFINE_ALIAS
 # from .norm import data_norm        #DEFINE_ALIAS
 # from .norm import group_norm        #DEFINE_ALIAS
-# from .norm import instance_norm        #DEFINE_ALIAS
 from .norm import l2_normalize  #DEFINE_ALIAS
-# from .norm import layer_norm        #DEFINE_ALIAS
+from .norm import batch_norm  #DEFINE_ALIAS
+from .norm import instance_norm  #DEFINE_ALIAS
+from .norm import layer_norm  #DEFINE_ALIAS
 from .norm import lrn  #DEFINE_ALIAS
+from .norm import normalize  #DEFINE_ALIAS
 # from .norm import spectral_norm        #DEFINE_ALIAS
+from .pooling import max_pool1d  #DEFINE_ALIAS
+from .pooling import avg_pool1d  #DEFINE_ALIAS
+from .pooling import adaptive_max_pool1d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool1d  #DEFINE_ALIAS
 from .pooling import pool2d  #DEFINE_ALIAS
 from .pooling import pool3d  #DEFINE_ALIAS
 from .pooling import adaptive_pool2d  #DEFINE_ALIAS
 from .pooling import adaptive_pool3d  #DEFINE_ALIAS
+from .pooling import avg_pool2d  #DEFINE_ALIAS
+from .pooling import max_pool2d  #DEFINE_ALIAS
+from .pooling import avg_pool3d  #DEFINE_ALIAS
+from .pooling import max_pool3d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool2d  #DEFINE_ALIAS
+from .pooling import adaptive_avg_pool3d  #DEFINE_ALIAS
 # from .rnn import gru_unit        #DEFINE_ALIAS
 # from .rnn import lstm        #DEFINE_ALIAS
 # from .rnn import lstm_unit        #DEFINE_ALIAS
@@ -164,7 +194,7 @@ from .vision import box_clip  #DEFINE_ALIAS
 from .vision import box_coder  #DEFINE_ALIAS
 from .vision import box_decoder_and_assign  #DEFINE_ALIAS
 from .vision import collect_fpn_proposals  #DEFINE_ALIAS
-# from .vision import deformable_conv        #DEFINE_ALIAS
+# from .vision import deformable_conv  #DEFINE_ALIAS
 from .vision import deformable_roi_pooling  #DEFINE_ALIAS
 from .vision import density_prior_box  #DEFINE_ALIAS
 from .vision import detection_output  #DEFINE_ALIAS
@@ -173,10 +203,10 @@ from .vision import fsp_matrix  #DEFINE_ALIAS
 from .vision import generate_mask_labels  #DEFINE_ALIAS
 from .vision import generate_proposal_labels  #DEFINE_ALIAS
 from .vision import generate_proposals  #DEFINE_ALIAS
-from .vision import grid_sampler  #DEFINE_ALIAS
+from .vision import grid_sample  #DEFINE_ALIAS
 from .vision import image_resize  #DEFINE_ALIAS
 from .vision import image_resize_short  #DEFINE_ALIAS
-# from .vision import multi_box_head        #DEFINE_ALIAS
+# from .vision import multi_box_head  #DEFINE_ALIAS
 from .vision import pixel_shuffle  #DEFINE_ALIAS
 from .vision import prior_box  #DEFINE_ALIAS
 from .vision import prroi_pool  #DEFINE_ALIAS
@@ -193,3 +223,4 @@ from .vision import shuffle_channel  #DEFINE_ALIAS
 from .vision import space_to_depth  #DEFINE_ALIAS
 from .vision import yolo_box  #DEFINE_ALIAS
 from .vision import yolov3_loss  #DEFINE_ALIAS
+from .input import one_hot  #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index dad6b252ed4d54db4a85d1912503ce5401a1ca4c..2e399db2a9aba4edce5ebd42df83df16937a80d9 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -14,59 +14,255 @@
 
 # TODO: define activation functions of neural network
 from ...fluid.layers import brelu  #DEFINE_ALIAS
-from ...fluid.layers import elu  #DEFINE_ALIAS
 from ...fluid.layers import erf  #DEFINE_ALIAS
-from ...fluid.layers import gelu  #DEFINE_ALIAS
-from ...fluid.layers import hard_shrink  #DEFINE_ALIAS
 from ...fluid.layers import hard_sigmoid  #DEFINE_ALIAS
 from ...fluid.layers import hard_swish  #DEFINE_ALIAS
-from ...fluid.layers import leaky_relu  #DEFINE_ALIAS
-from ...fluid.layers import logsigmoid  #DEFINE_ALIAS
 from ...fluid.layers import maxout  #DEFINE_ALIAS
-from ...fluid.layers import relu6  #DEFINE_ALIAS
-from ...fluid.layers import selu  #DEFINE_ALIAS
 from ...fluid.layers import soft_relu  #DEFINE_ALIAS
-from ...fluid.layers import softmax  #DEFINE_ALIAS
-from ...fluid.layers import softplus  #DEFINE_ALIAS
-from ...fluid.layers import softshrink  #DEFINE_ALIAS
-from ...fluid.layers import softsign  #DEFINE_ALIAS
 from ...fluid.layers import swish  #DEFINE_ALIAS
-from ...fluid.layers import tanh_shrink  #DEFINE_ALIAS
+from ...fluid.layers import sigmoid  #DEFINE_ALIAS
 from ...fluid.layers import thresholded_relu  #DEFINE_ALIAS
+from ...tensor.math import tanh  #DEFINE_ALIAS
 
 __all__ = [
     'brelu',
     'elu',
     'erf',
     'gelu',
-    'hard_shrink',
+    'hardshrink',
+    'hardtanh',
     'hard_sigmoid',
     'hard_swish',
     'hsigmoid',
     'leaky_relu',
     'logsigmoid',
     'maxout',
-    #       'prelu',
+    'prelu',
     'relu',
     'relu6',
     'selu',
-    'sigmoid',
     'soft_relu',
     'softmax',
     'softplus',
     'softshrink',
     'softsign',
+    'sigmoid',
     'swish',
-    'tanh_shrink',
+    'tanh',
+    'tanhshrink',
     'thresholded_relu',
-    'log_softmax'
+    'log_softmax',
 ]
 
 import warnings
 from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
 from ...fluid import core
-from ...fluid.data_feeder import check_variable_and_dtype
+from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+import paddle
+
+
+def elu(x, alpha=1.0, name=None):
+    """
+    elu activation.
+
+    .. math::
+
+        elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1,6],[1,15.6]]))
+            out = F.elu(x, alpha=0.2) 
+            # [[-0.12642411  6.        ]
+            #  [ 1.          15.6      ]]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.elu(x, 'alpha', alpha)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu')
+    helper = LayerHelper("elu", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='elu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': alpha})
+    return out
+
+
+def gelu(x, approximate=False, name=None):
+    """
+    gelu activation.
+
+    if approximate is True
+
+    .. math::
+
+        gelu(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    else
+
+    .. math::
+
+        gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
+    
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        approximate (bool, optional): Wether to enable approximation. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]]))
+            out1 = F.gelu(x) # [-0.158655 0.345731 0.841345 1.39979]
+            out2 = F.gelu(x, True) # [-0.158808 0.345714 0.841192 1.39957]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.gelu(x, 'approximate', approximate)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu')
+    helper = LayerHelper("gelu", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='gelu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'approximate': approximate})
+    return out
+
+
+def hardshrink(x, threshold=0.5, name=None):
+    """
+    hard shrinkage activation
+
+    .. math::
+
+        hardshrink(x)=
+            \left\{
+            \begin{aligned}
+            &x, & & if \ x > threshold \\
+            &x, & & if \ x < -threshold \\
+            &0, & & if \ others
+            \end{aligned}
+            \right.
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 2.5]))
+            out = F.hardshrink(x) # [-1., 0., 2.5]
+
+    """
+    if in_dygraph_mode():
+        return core.ops.hard_shrink(x, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'hardshrink')
+    helper = LayerHelper('hardshrink', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='hard_shrink',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold})
+    return out
+
+
+def hardtanh(x, min=-1.0, max=1.0, name=None):
+    """
+    hardtanh activation
+
+    .. math::
+
+        hardtanh(x)= \\begin{cases}
+                        max, \\text{if } x > max \\\\
+                        min, \\text{if } x < min \\\\
+                        x,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        min (float, optional): The minimum value of the linear region range. Default is -1.
+        max (float, optional): The maximum value of the linear region range. Default is 1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5]))
+            out = F.hardtanh(x) # [-1., 0.3, 1.]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.brelu(x, 't_min', min, 't_max', max)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'hardtanh')
+
+    helper = LayerHelper('hardtanh', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='brelu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'t_min': min,
+               't_max': max})
+    return out
 
 
 def hsigmoid(input,
@@ -126,7 +322,6 @@ def hsigmoid(input,
         Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`.
 
     Examples:
-
         .. code-block:: python
 
             from paddle import fluid, nn
@@ -192,192 +387,454 @@ def hsigmoid(input,
     return out
 
 
-def relu(input, inplace=False, name=None):
+def leaky_relu(x, negative_slope=0.01, name=None):
     """
-	:alias_main: paddle.nn.functional.relu
-	:alias: paddle.nn.functional.relu,paddle.nn.functional.activation.relu
-
-    ReLU Activation.
+    leaky_relu activation
 
     .. math:
+        leaky_relu(x)=
+            \left\{
+            \begin{aligned}
+            &x, & & if \ x >= 0 \\
+            &negative\_slope * x, & & otherwise \\
+            \end{aligned}
+            \right. \\
 
-        out = max(x, 0)
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        negative_slope (float, optional): Slope of the activation function at
+            :math:`x < 0` . Default is 0.01.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-2, 0, 1], 'float32'))
+            out = F.leaky_relu(x) # [-0.02, 0., 1.]
+
+    """
+    if in_dygraph_mode():
+        return core.ops.leaky_relu(x, 'alpha', negative_slope)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'leaky_relu')
+    helper = LayerHelper('leaky_relu', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='leaky_relu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': negative_slope})
+    return out
+
+
+def prelu(x, weight, name=None):
+    """
+    prelu activation.
+
+    .. math::
+
+        prelu(x) = max(0, x) + weight * min(0, x)
 
     Parameters:
-        input (Variable): The input variable. A multi-dimension Tensor with type float16, float32, or float64.
-        inplace (bool, optional): If inplace is True, the input and output of ``ReLU`` are the same variable.
-            Otherwise, the input and output of ``ReLU`` are different variables. Default: False. Note that if x is
-            more than one OPs' input, inplace must be False.
-        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
+        x (Tensor): The input Tensor with data type float32, float64.
+        weight (Tensor): The learnable parameter with data type same as ``x``.
+            The weight shape is [1] or [in], where `in` is the input channel of ``x``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Output of relu operator, a Tensor with shape same as input
+        A Tensor with the same data type and shape as ``x`` .
 
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn.functional as functional
-          import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
 
-          data = np.array([-2, 0, 1]).astype('float32')
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = functional.relu(data)  # [0, 0, 1]
+            paddle.disable_static()
+
+            data = np.array([[[[-2.0,  3.0, -4.0,  5.0],
+                               [ 3.0, -4.0,  5.0, -6.0],
+                               [-7.0, -8.0,  8.0,  9.0]],
+                              [[ 1.0, -2.0, -3.0,  4.0],
+                               [-5.0,  6.0,  7.0, -8.0],
+                               [ 6.0,  7.0,  8.0,  9.0]]]], 'float32')
+            x = paddle.to_tensor(data)
+            w = paddle.to_tensor(np.array([0.25]).astype('float32'))
+            out = F.prelu(x, w)
+            # [[[[-0.5 ,  3.  , -1.  ,  5.  ],
+            #    [ 3.  , -1.  ,  5.  , -1.5 ],
+            #    [-1.75, -2.  ,  8.  ,  9.  ]],
+            #   [[ 1.  , -0.5 , -0.75,  4.  ],
+            #    [-1.25,  6.  ,  7.  , -2.  ],
+            #    [ 6.  ,  7.  ,  8.  ,  9.  ]]]]
     """
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu')
+    check_variable_and_dtype(weight, 'weight',
+                             ['float16', 'float32', 'float64'], 'prelu')
+
+    helper = LayerHelper('prelu', **locals())
+    assert len(weight.shape
+               ) == 1, "The dim count of weight shape should be 1 in prelu()."
+
+    # NOTE(): The input of this API should be ``N,C,...`` format, 
+    # which means x.shape[0] is batch_size and x.shape[0] is channel.
+    mode = 'all'
+    if weight.shape[0] > 1:
+        assert len(
+            x.shape
+        ) > 1, "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]."
+        assert weight.shape[0] == x.shape[
+            1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+        mode = 'channel'
 
     if in_dygraph_mode():
-        if inplace:
-            warnings.warn(
-                "Inplace on ReLU is not allowed and will be discarded in dygraph mode currently."
-            )
-        return core.ops.relu(input)
+        return core.ops.prelu(x, weight, 'mode', mode)
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type="prelu",
+        inputs={"X": x,
+                "Alpha": weight},
+        outputs={"Out": out},
+        attrs={"mode": mode})
+    return out
 
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'relu')
 
-    helper = LayerHelper('relu', **locals())
-    outs = input if inplace else helper.create_variable_for_type_inference(
-        input.dtype)
-    helper.append_op(type='relu', inputs={'X': [input]}, outputs={'Out': outs})
-    return outs
+def relu(x, name=None):
+    """
+    relu activation.
+
+    .. math::
+
+        out = max(x, 0)
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
 
-def sigmoid(input, inplace=False, name=None):
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32'))
+            out = F.relu(x) # [0., 0., 1.]
     """
-	:alias_main: paddle.nn.functional.sigmoid
-	:alias: paddle.nn.functional.sigmoid,paddle.nn.functional.activation.sigmoid
 
-    Sigmoid Activation.
+    if in_dygraph_mode():
+        return core.ops.relu(x)
 
-    .. math:
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu')
+    helper = LayerHelper('relu', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='relu', inputs={'X': x}, outputs={'Out': out})
+    return out
 
-        output = \frac{1}{1 + e^{-input}}
+
+def logsigmoid(x, name=None):
+    """
+    logsigmoid activation.
+
+    .. math::
+
+        logsigmoid(x) = log \\frac{1}{1 + e^{-x}}
     
     Parameters:
-        input (Variable): The input variable. A multi-dimension Tensor with type float16, float32, or float64.
-        inplace (bool, optional): If inplace is True, the input and output are the same variable.
-            Otherwise, the input and output of are different variables. Default: False. Note that if x is
-            more than one OPs' input, inplace must be False.
-        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
     
     Returns:
-        Output of sigmoid operator, a Tensor with shape same as input
+        A Tensor with the same data type and shape as ``x`` .
     
     Examples:
         .. code-block:: python
-          
-          import paddle.fluid as fluid
-          import paddle.nn.functional as functional
-          import numpy as np
-          # In the static graph mode
-          input = fluid.data(name="input", shape=[None, 4])
-          output = functional.sigmoid(input)
-          place = fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(fluid.default_startup_program())
-          input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
-          output_data = exe.run(feed={"input": input_data},
-                                fetch_list=[output])
-          print(output_data) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
-          # In the dynamic graph mode
-          with fluid.dygraph.guard():
-              input = fluid.dygraph.to_variable(input_data)
-              output = functional.sigmoid(input)
-              print(output) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+            out = F.logsigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.logsigmoid(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'logsigmoid')
+    helper = LayerHelper("logsigmoid", **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out})
+    return out
+
+
+def relu6(x, name=None):
+    """
+    relu6 activation
+
+    .. math::
+
+        relu6(x) = min(max(0,x), 6)
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 6.5]))
+            out = F.relu6(x) # [0, 0.3, 6]
     """
+    threshold = 6.0
+    if in_dygraph_mode():
+        return core.ops.relu6(x, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6')
+    helper = LayerHelper('relu6', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='relu6',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold})
+    return out
+
+
+def selu(x,
+         scale=1.0507009873554804934193349852946,
+         alpha=1.6732632423543772848170429916717,
+         name=None):
+    """
+    selu activation
+
+    .. math::
+
+        selu(x)= scale *
+                 \\begin{cases}
+                   x, \\text{if } x > 0 \\\\
+                   alpha * e^{x} - alpha, \\text{if } x <= 0
+                 \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        scale (float, optional): The value of scale for selu. Default is 1.0507009873554804934193349852946
+        alpha (float, optional): The value of alpha for selu. Default is 1.6732632423543772848170429916717
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
 
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
+            out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]]
+    """
     if in_dygraph_mode():
-        if inplace:
-            warnings.warn(
-                "Inplace on sigmoid is not allowed and will be discarded in dygraph mode currently."
-            )
-        return core.ops.sigmoid(input)
-
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'sigmoid')
-    helper = LayerHelper("sigmoid", **locals())
-    outputs = helper.create_variable_for_type_inference(input.dtype)
+        return core.ops.selu(x, 'scale', scale, 'alpha', alpha)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu')
+    helper = LayerHelper('selu', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
-        type='sigmoid', inputs={'X': [input]}, outputs={'Out': outputs})
-    return outputs
+        type='selu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'scale': scale,
+               'alpha': alpha})
+    return out
 
 
-def log_softmax(input, axis=None, dtype=None, name=None):
+def softmax(x, axis=-1, dtype=None, name=None):
     """
-	:alias_main: paddle.nn.functional.log_softmax
-	:alias: paddle.nn.functional.log_softmax,paddle.nn.functional.activation.log_softmax
+    This operator implements the softmax layer. The calculation process is as follows:
+
+    1. The dimension :attr:`axis` of ``x`` will be permuted to the last.
 
-    This operator implements the log_softmax layer. The calculation process is as follows:
+    2. Then ``x`` will be logically flattened to a 2-D matrix. The matrix's second
+    dimension(row length) is the same as the dimension :attr:`axis` of ``x``,
+    and the first dimension(column length) is the product of all other dimensions
+    of ``x``. For each row of the matrix, the softmax operator squashes the
+    K-dimensional(K is the width of the matrix, which is also the size of ``x``'s
+    dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional
+    vector of real values in the range [0, 1] that add up to 1.
+
+    3. After the softmax operation is completed, the inverse operations of steps 1 and 2
+    are performed to restore the two-dimensional matrix to the same dimension as the ``x`` .
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
 
     .. math::
 
-        Out[i, j] = log(softmax(x)) 
-                  = log(\\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+        softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])}
+
+    Example:
+
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+
+          Attrs:
+            axis = -1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+                        [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+
+        Case 2:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+          Attrs:
+            axis = 1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
+                         [0.01786798, 0.01786798, 0.04661262, 0.04661262],
+                         [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
+                        [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
+                         [0.26762315, 0.26762315, 0.26762315, 0.26762315],
+                         [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
 
     Parameters:
-        input (Variable): The input variable. A multi-dimension Tensor with type float32, or float64.
-        axis (int, optional): The index of dimension to perform softmax calculations, it should be in
-            range :math:`[-1, rank-1]`, while :math:`rank` is the rank of input variable. Default: None. 
-            None and -1 means the last dimension.
-        dtype (np.dtype|core.VarDesc.VarType|str): The desired data type of returned tensor. If specified,
-            the input tensor is casted to dtype before the operation is performed. This is useful for
-            preventing data type overflows. Default: None. Supported dtype: float32 or float64
-        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
- 
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
+            :math:`axis + D` . Default is -1.
+        dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
+            type of the output tensor. If dtype is specified, ``x`` is casted
+            to ``dtype`` before the operation is performed. This is useful for 
+            preventing data type overflows. Supported dtype: float32, float64.
+            If ``dtype`` is None, the output Tensor has the same dtype as x.
+            Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
     Returns:
-        Variable: ``Tensor`` indicates the output of softmax. The data type and shape are the same as ``input``.
+        A Tensor with the same shape and data type (use ``dtype`` if it is
+        specified) as x.
 
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn.functional as F
-          import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
 
-          data = np.array([[[-2.0, 3.0, -4.0, 5.0],
-                            [3.0, -4.0, 5.0, -6.0],
-                            [-7.0, -8.0, 8.0, 9.0]],
-                           [[1.0, -2.0, -3.0, 4.0],
-                            [-5.0, 6.0, 7.0, -8.0],
-                            [6.0, 7.0, 8.0, 9.0]]]).astype('float32')
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = F.log_softmax(data, -1)
-              # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
-              #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
-              #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
-              #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
-              #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
-              #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+            paddle.disable_static()
+
+            x = np.array([[[2.0, 3.0, 4.0, 5.0],
+                        [3.0, 4.0, 5.0, 6.0],
+                        [7.0, 8.0, 8.0, 9.0]],
+                        [[1.0, 2.0, 3.0, 4.0],
+                        [5.0, 6.0, 7.0, 8.0],
+                        [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.to_tensor(x)
+            out1 = F.softmax(x)
+            out2 = F.softmax(x, dtype='float64')
+            # out1's data type is float32; out2's data type is float64
+            # out1 and out2's value is as follows:
+            # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+            # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
     """
 
-    axis = -1 if axis is None else axis
-    dtype = convert_np_dtype_to_dtype_(dtype) if dtype is not None else dtype
+    if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+    use_cudnn = True if axis is -1 else False
 
     if in_dygraph_mode():
-        outs_cast = input if dtype is None \
-            else core.ops.cast(input, 'in_dtype', input.dtype, 'out_dtype', dtype)
-        outs_softmax = core.ops.softmax(outs_cast, 'axis', axis, 'use_cudnn',
-                                        False)
-        return core.ops.log(outs_softmax)
+        outs_cast = x if dtype is None \
+            else core.ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
+        return core.ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', use_cudnn)
 
     if dtype is None:
-        check_variable_and_dtype(
-            input, 'input', ['float16', 'float32', 'float64'], 'log_softmax')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'softmax')
+    else:
+        check_dtype(dtype, 'dtype', ['float32', 'float64'], 'softmax',
+                    'If dtype is not None, it only support float32 or float64.')
 
-    helper = LayerHelper("log_softmax", **locals())
-    outs_cast = input
+    helper = LayerHelper("softmax", **locals())
+    outs_cast = x
     if dtype is not None:
         outs_cast = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type='cast',
-            inputs={'X': input},
+            inputs={'X': x},
             outputs={'Out': outs_cast},
-            attrs={'in_dtype': input.dtype,
+            attrs={'in_dtype': x.dtype,
                    'out_dtype': dtype})
 
     outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
@@ -386,10 +843,277 @@ def log_softmax(input, axis=None, dtype=None, name=None):
         inputs={'X': outs_cast},
         outputs={'Out': outs_softmax},
         attrs={'axis': axis,
-               'use_cudnn': False})
+               'use_cudnn': use_cudnn})
+
+    return outs_softmax
+
+
+def softplus(x, beta=1, threshold=20, name=None):
+    """
+    softplus activation
+
+    .. math::
+
+        softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\
+        \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        beta (float, optional): The value of beta for softplus. Default is 1
+        threshold (float, optional): The value of threshold for softplus. Default is 20
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355]
+    """
+    if in_dygraph_mode():
+        return core.ops.softplus(x, 'beta', beta, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'softplus')
+    helper = LayerHelper('softplus', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='softplus',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'beta': beta,
+               'threshold': threshold})
+    return out
+
+
+def softshrink(x, threshold=0.5, name=None):
+    """
+    softshrink activation
+
+    .. math::
+
+        softshrink(x)= \\begin{cases}
+                        x - threshold, \\text{if } x > threshold \\\\
+                        x + threshold, \\text{if } x < -threshold \\\\
+                        0,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8]))
+            out = F.softshrink(x) # [-0.4, 0, 0, 0.3]
+    """
+    if threshold < 0:
+        raise ValueError(
+            "The threshold must be no less than zero. Received: {}.".format(
+                threshold))
+
+    if in_dygraph_mode():
+        return core.ops.softshrink(x, 'lambda', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'softshrink')
+    helper = LayerHelper('softshrink', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='softshrink',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'lambda': threshold})
+    return out
+
+
+def softsign(x, name=None):
+    """
+    softsign activation
+
+    .. math::
+
+        softsign(x) = \\frac{x}{1 + |x|}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
+    """
+    if in_dygraph_mode():
+        return core.ops.softsign(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'softsign')
+    helper = LayerHelper('softsign', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='softsign', inputs={'X': x}, outputs={'Out': out})
+    return out
+
+
+def tanhshrink(x, name=None):
+    """
+    tanhshrink activation
+
+    .. math::
+
+        tanhshrink(x) = x - tanh(x)
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
+    """
+    if in_dygraph_mode():
+        return core.ops.tanh_shrink(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'tanhshrink')
+    helper = LayerHelper('tanh_shrink', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='tanh_shrink', inputs={'X': x}, outputs={'Out': out})
+    return out
+
 
-    outs_log = helper.create_variable_for_type_inference(outs_softmax.dtype)
+def log_softmax(x, axis=-1, dtype=None, name=None):
+    """
+    This operator implements the log_softmax layer. The calculation process is
+    as follows:
+
+    .. math::
+
+        Out[i, j] = log(softmax(x)) 
+                  = log(\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
+            :math:`axis + D` . Default is -1.
+        dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
+            type of the output tensor. If dtype is specified, ``x`` is casted
+            to ``dtype`` before the operation is performed. This is useful for 
+            preventing data type overflows. Supported dtype: float32, float64.
+            If ``dtype`` is None, the output Tensor has the same dtype as x.
+            Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+ 
+    Returns:
+        A Tensor with the same shape and data type (use ``dtype`` if it is
+        specified) as x.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[-2.0, 3.0, -4.0, 5.0],
+                            [3.0, -4.0, 5.0, -6.0],
+                            [-7.0, -8.0, 8.0, 9.0]],
+                            [[1.0, -2.0, -3.0, 4.0],
+                            [-5.0, 6.0, 7.0, -8.0],
+                            [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.to_tensor(x)
+            out1 = F.log_softmax(x)
+            out2 = F.log_softmax(x, dtype='float64')
+            # out1's data type is float32; out2's data type is float64
+            # out1 and out2's value is as follows:
+            # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
+            #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
+            #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
+            #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
+            #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
+            #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+    """
+
+    if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if in_dygraph_mode():
+        if dtype is not None:
+            x = core.ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
+        return core.ops.log_softmax(x, 'axis', axis)
+
+    if dtype is None:
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'log_softmax')
+    else:
+        check_dtype(dtype, 'dtype', ['float32', 'float64'], 'log_softmax',
+                    'If dtype is not None, it only support float32 or float64.')
+
+    helper = LayerHelper("log_softmax", **locals())
+    out_cast = x
+    if dtype is not None:
+        out_cast = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type='cast',
+            inputs={'X': x},
+            outputs={'Out': out_cast},
+            attrs={'in_dtype': x.dtype,
+                   'out_dtype': dtype})
+
+    out = helper.create_variable_for_type_inference(out_cast.dtype)
     helper.append_op(
-        type='log', inputs={'X': outs_softmax}, outputs={'Out': outs_log})
+        type='log_softmax',
+        inputs={'X': out_cast},
+        outputs={'Out': out},
+        attrs={'axis': axis})
 
-    return outs_log
+    return out
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index fe41cb6e64c34f34add3c0652ab5b30efe958161..cff108ec6a9a8666e0aa51ba0414fd885777f1a7 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -13,23 +13,36 @@
 # limitations under the License.
 
 import warnings
+import paddle
+from ...fluid.framework import in_dygraph_mode, default_main_program
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.layers.tensor import Variable, fill_constant
+from paddle.fluid.layers.tensor import Variable, fill_constant, zeros, concat
 
 # TODO: define the common functions to build a neural network  
-from ...fluid.layers import dropout  #DEFINE_ALIAS
 from ...fluid.layers import label_smooth  #DEFINE_ALIAS
 from ...fluid import one_hot  #DEFINE_ALIAS
-from ...fluid.layers import pad  #DEFINE_ALIAS
 from ...fluid.layers import pad2d  #DEFINE_ALIAS
 from ...fluid.layers import unfold  #DEFINE_ALIAS
 from ...fluid.layers import assign  #DEFINE_ALIAS
+from ...fluid.layers import squeeze  #DEFINE_ALIAS
+from ...fluid.layers import unsqueeze  #DEFINE_ALIAS
+from ...fluid.layers import elementwise_mul  #DEFINE_ALIAS
+from ...tensor import clip
+from ...tensor import sum
+from ...tensor import sqrt
 
 #from ...fluid.layers import fc  #DEFINE_ALIAS
 from ...fluid.layers import pad_constant_like  #DEFINE_ALIAS
+from ...fluid.framework import in_dygraph_mode
+from ...fluid import core, dygraph_utils
+from ...fluid import core, layers
+from ...fluid.data_feeder import check_variable_and_dtype
 
 __all__ = [
     'dropout',
+    'dropout2d',
+    'dropout3d',
+    'alpha_dropout',
     #       'embedding',
     #       'fc',
     'label_smooth',
@@ -40,7 +53,9 @@ __all__ = [
     'unfold',
     #       'bilinear_tensor_product',
     'assign',
-    'interpolate'
+    'interpolate',
+    'bilinear',
+    'cosine_similarity',
 ]
 
 
@@ -446,3 +461,708 @@ def interpolate(input,
         outputs={"Out": out},
         attrs=attrs)
     return out
+
+
+def bilinear(x1, x2, weight, bias=None, name=None):
+    """
+
+    This layer performs bilinear on two inputs.
+
+    .. math::
+      out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1
+      out = out + b
+
+    In this formula:
+     - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features].
+     - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features].
+     - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features].
+     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features].
+     - :math:`b`: the learned bias, shape is [1, out_features].
+     - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`.
+
+    Parameters:
+       x1 (Tensor): the first input tensor, it's data type should be float32, float64.
+       x2 (Tensor): the second input tensor, it's data type should be float32, float64.
+       weight (Parameter): The learnable weights of this layer, shape is [out_features, in1_features, in2_features].
+       bias (Parameter, optional): The learnable bias(Bias) of this layer, shape is [1, out_features]. If it is set to None, no bias will be added to the output units. The default value is None.
+       name (str, optional): The default value is None. Normally there is no need for user
+           to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Returns:
+       Variable: A 2-D Tensor of shape [batch_size, out_features].
+
+    Examples:
+       .. code-block:: python
+
+        import paddle
+        import numpy
+        import paddle.nn.functional as F
+
+        paddle.disable_static()
+        x1 = numpy.random.random((5, 5)).astype('float32')
+        x2 = numpy.random.random((5, 4)).astype('float32')
+        w = numpy.random.random((1000, 5, 4)).astype('float32')
+        b = numpy.random.random((1, 1000)).astype('float32')
+
+        result = F.bilinear(paddle.to_tensor(x1), paddle.to_tensor(x2), paddle.to_tensor(w), paddle.to_tensor(b))           # result shape [5, 1000]
+
+    """
+
+    if in_dygraph_mode():
+        return core.ops.bilinear_tensor_product(x1, x2, weight, bias)
+
+    check_variable_and_dtype(x1, 'x1', ['float32', 'float64'], 'bilinear')
+    check_variable_and_dtype(x2, 'x2', ['float32', 'float64'], 'bilinear')
+
+    inputs = {"X": x1, "Y": x2, "Weight": weight}
+    if bias is not None:
+        inputs["Bias"] = bias
+
+    helper = LayerHelper("bilinear", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x1.dtype)
+
+    helper.append_op(
+        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out})
+
+    return out
+
+
+def dropout(x,
+            p=0.5,
+            axis=None,
+            training=True,
+            mode="upscale_in_train",
+            name=None):
+    """
+    Dropout is a regularization technique for reducing overfitting by preventing
+    neuron co-adaption during training. The dropout operator randomly sets the
+    outputs of some units to zero, while upscale others according to the given
+    dropout probability.
+
+    Args:
+        x (Tensor): The input tensor. The data type is float32 or float64.
+        p (float | int): Probability of setting units to zero. Default 0.5.
+        axis (int | list): The axis along which the dropout is performed. Default None.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']
+
+                           1. upscale_in_train(default), upscale the output at training time
+
+                              - train: out = input * mask / ( 1.0 - dropout_prob )
+                              - inference: out = input
+
+                           2. downscale_in_infer, downscale the output at inference
+
+                              - train: out = input * mask
+                              - inference: out = input * (1.0 - dropout_prob)
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout, has same shape and data type as `x` .
+
+    Examples:
+        We use ``p=0.5`` in the following description for simplicity.
+        1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly.
+            Let's see a simple case when x is a 2d tensor with shape 2*3:
+            [[1 2 3]
+             [4 5 6]]
+            we generate mask with the same shape as x, which is 2*3. The value of mask is
+            sampled from a Bernoulli distribution randomly. For example, we may get such mask:
+            [[0 1 0]
+             [1 0 1]]
+            So the output is obtained from elementwise multiply of x and mask:
+            [[0 2 0]
+             [4 0 6]]
+            Using default setting, i.e. ``mode='upscale_in_train'`` ,
+            if in training phase, the final upscale output is:
+            [[0 4 0 ]
+             [8 0 12]]
+            if in test phase, the output is the same as input:
+            [[1 2 3]
+             [4 5 6]]
+            we can also set ``mode='downscale_in_infer'`` , then
+            if in training phase, the final output is:
+            [[0 2 0]
+             [4 0 6]]
+            if in test phase, the scale output is:
+            [[0.5 1.  1.5]
+             [2.  2.5 3. ]]
+
+        2. When ``axis!=None`` , this is useful for dropping whole channels from an image or sequence.
+            Let's see the simple case when x is a 2d tensor with shape 2*3 again:
+            [[1 2 3]
+             [4 5 6]]
+            (1) If ``axis=0`` , this means the dropout is only performed in axis `0` .
+                we generate mask with the shape 2*1. Only in axis `0` the value is randomly selected.
+                For example, we may get such mask:
+                [[1]
+                 [0]]
+                The output is obtained from elementwise multiply of x and mask. Doing that the mask will be
+                broadcast from 2*1 to 2*3:
+                [[1 1 1]
+                 [0 0 0]]
+                and the result after elementwise multiply is:
+                [[1 2 3]
+                 [0 0 0]]
+                then we can do upscale or downscale according to the setting of other arguments.
+            (2) If ``axis=1`` , this means the dropout is only performed in axis `1` .
+                we generate mask with the shape 1*3. Only in axis `1` the value is randomly selected.
+                For example, we may get such mask:
+                [[1 0 1]]
+                Doing elementwise multiply the mask will be broadcast from 1*3 to 2*3:
+                [[1 0 1]
+                 [1 0 1]]
+                and the result after elementwise multiply is:
+                [[1 0 3]
+                 [4 0 6]]
+            (3) What about ``axis=[0, 1]`` ? This means the dropout is performed in all axes of x,
+                which is the same case as default setting ``axis=None`` .
+            (4) You may note that logically `axis=None` means the dropout is performed in none axis of x,
+                We generate mask with the shape 1*1. Whole input is randomly selected or dropped.
+                For example, we may get such mask:
+                [[0]]
+                Doing elementwise multiply the mask will be broadcast from 1*1 to 2*3:
+                [[0 0 0]
+                 [0 0 0]]
+                and the result after elementwise multiply is:
+                [[0 0 0]
+                 [0 0 0]]
+                Actually this is not what we want because all elements may set to zero~
+            When x is a 4d tensor with shape `NCHW`, we can set ``axis=[0,1]`` and the dropout will be performed
+            in channel `N` and `C`, `H` and `W` is tied, i.e.
+            paddle.nn.dropout(x, p, axis=[0,1])
+            Please refer to ``paddle.nn.functional.dropout2d`` for more details.
+            Similarly, when x is a 5d tensor with shape `NCDHW`, we can set ``axis=[0,1]`` to perform
+            dropout3d. Please refer to ``paddle.nn.functional.dropout3d`` for more details.
+
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[1,2,3], [4,5,6]]).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.dropout(x, 0.5)
+            y_test = paddle.nn.functional.dropout(x, 0.5, training=False) 
+            y_0 = paddle.nn.functional.dropout(x, axis=0)
+            y_1 = paddle.nn.functional.dropout(x, axis=1)
+            y_01 = paddle.nn.functional.dropout(x, axis=[0,1])
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+            print(y_0.numpy())
+            print(y_1.numpy())
+            print(y_01.numpy())
+
+    """
+    if not isinstance(p, (float, int)):
+        raise TypeError("p argument should be a number")
+    if p < 0 or p > 1:
+        raise ValueError("p argument should between 0 and 1")
+    if mode not in ('downscale_in_infer', 'upscale_in_train'):
+        raise ValueError(
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+    if axis and not isinstance(axis, (int, list)):
+        raise TypeError("datatype of axis argument should be int or list")
+
+    if axis == None:  # commonly used dropout
+        seed = None
+        mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+
+        def get_attrs(prog, dropout_prob, is_test, seed):
+            if (seed is None or seed == 0) and prog.random_seed != 0:
+                seed = prog.random_seed
+            attrs = {
+                'dropout_prob': dropout_prob,
+                'is_test': is_test,
+                'fix_seed': seed is not None,
+                'seed': seed if seed is not None else 0,
+                'dropout_implementation': mode,
+            }
+            return attrs
+
+        if in_dygraph_mode():
+            if default_main_program().random_seed != 0:
+                seed = default_main_program().random_seed
+            out, mask = core.ops.dropout(
+                x, 'dropout_prob', p, 'is_test', not training, 'fix_seed',
+                seed is not None, 'seed', seed
+                if seed is not None else 0, 'dropout_implementation', mode)
+            return out
+
+        helper = LayerHelper('dropout', **locals())
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'dropout')
+
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        mask = helper.create_variable_for_type_inference(
+            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+
+        attrs = get_attrs(helper.main_program, p, not training, seed)
+
+        helper.append_op(
+            type='dropout',
+            inputs={'X': [x]},
+            outputs={'Out': [out],
+                     'Mask': [mask]},
+            attrs=attrs)
+        return out
+    else:  #sometimes called dropout_nd #TODO: optimize with c++
+        if not in_dygraph_mode():
+            check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dropout')
+        dtype = x.dtype
+        keep_prob = 1 - p
+        if training:
+            if p == 1.:
+                return layers.scale(x, scale=0.)
+
+            scale_input = layers.scale(
+                x, scale=1 / keep_prob) if mode == 'upscale_in_train' else x
+
+            #get mask shape
+            input_shape = x.shape
+            drop_axes = [axis] if isinstance(axis, int) else axis
+            if max(drop_axes) > len(input_shape) - 1:
+                raise ValueError("axis value should less than dimensions of x:{}, but get drop_axes value:{} " \
+                                 .format(len(input_shape), max(drop_axes)))
+            if len(drop_axes) > len(input_shape):
+                raise ValueError(
+                    "length of axis should not greater than dimensions of x:{}, but get length of drop axes: {}".
+                    format(len(input_shape), len(drop_axes)))
+            mask_shape = [1] * len(input_shape)
+            for i in drop_axes:
+                mask_shape[i] = input_shape[i]
+
+            #get mask
+            random_tensor = layers.uniform_random(
+                mask_shape, dtype='float32', min=0., max=1.0)
+            p = layers.fill_constant(shape=[1], dtype='float32', value=p)
+            keep_mask = layers.greater_equal(random_tensor, p)
+
+            scale_input = layers.cast(scale_input, dtype)
+            keep_mask = layers.cast(keep_mask, dtype)
+            ret = paddle.multiply(scale_input, keep_mask, name=name)
+            return ret
+        else:  # test
+            ret = layers.scale(
+                x, scale=keep_prob) if mode == 'downscale_in_infer' else x
+            return ret
+
+
+def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None):
+    """
+    Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` ,
+    a channel is a 2D feature map with the shape `HW` ). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+
+    See ``paddle.nn.functional.dropout`` for more details.
+
+    Args:
+        x (Tensor):  The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C].
+                     The data type is float32 or float64.
+        p (float): Probability of setting units to zero. Default 0.5.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    `NCHW` , `NHWC` . The default is `NCHW` . When it is `NCHW` , the data is
+                                    stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout2d, has same shape and data type as `x` .
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5)).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.dropout2d(x)  #train
+            y_test = paddle.nn.functional.dropout2d(x, training=False) #test
+            for i in range(2):
+                for j in range(3):
+                    print(x.numpy()[i,j,:,:])
+                    print(y_train.numpy()[i,j,:,:]) # may all 0
+                    print(y_test.numpy()[i,j,:,:])
+    """
+    input_shape = x.shape
+    if len(input_shape) != 4:
+        raise ValueError("dimensions of x should be 4, but received {} != 4"\
+        .format(len(input_shape)))
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    return dropout(
+        x,
+        p=p,
+        axis=[0, 1] if data_format == 'NCHW' else [0, 3],
+        training=training,
+        mode="upscale_in_train",
+        name=name)
+
+
+def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
+    """
+    Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` ,
+    a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+
+    See ``paddle.nn.functional.dropout`` for more details.
+
+    Args:
+        x (Tensor):  The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C].
+                     The data type is float32 or float64.
+        p (float): Probability of setting units to zero. Default 0.5.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    ``NCDHW``, ``NDHWC``. The default is ``NCDHW`` . When it is ``NCDHW`` , the data is
+                                    stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout3d, has same shape and data type with `x` .
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.dropout3d(x)  #train
+            y_test = paddle.nn.functional.dropout3d(x, training=False) #test
+            print(x.numpy()[0,0,:,:,:])
+            print(y_train.numpy()[0,0,:,:,:]) # may all 0
+            print(y_test.numpy()[0,0,:,:,:])
+    """
+
+    input_shape = x.shape
+    if len(input_shape) != 5:
+        raise ValueError("dimensions of x should be 5, but received {} != 5" \
+        .format(len(input_shape)))
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    return dropout(
+        x,
+        p=p,
+        axis=[0, 1] if data_format == 'NCDHW' else [0, 4],
+        training=training,
+        mode="upscale_in_train",
+        name=name)
+
+
+def alpha_dropout(x, p=0.5, training=True, name=None):
+    """
+    Alpha Dropout is a type of Dropout that maintains the self-normalizing property.
+    For an input with zero mean and unit standard deviation, the output of Alpha Dropout
+    maintains the original mean and standard deviation of the input.
+    Alpha Dropout fits well to SELU activate function by randomly setting activations to the negative saturation value.
+
+    Args:
+        x (Tensor): The input tensor. The data type is float32 or float64.
+        p (float | int): Probability of setting units to zero. Default 0.5.
+        training (bool): A flag indicating whether it is in train phrase or not. Default True.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout, has same shape and data type as `x`.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[-1, 1], [-1, 1]]).astype('float32')
+            x = paddle.to_tensor(x)
+            y_train = paddle.nn.functional.alpha_dropout(x, 0.5)
+            y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False)
+            print(x.numpy())
+            print(y_train.numpy())
+            # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly)
+            print(y_test.numpy())
+    """
+    if not isinstance(p, (float, int)):
+        raise TypeError("p argument should be a float or int")
+    if p < 0 or p > 1:
+        raise ValueError("p argument should between 0 and 1")
+
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'alpha_dropout')
+
+    if training:
+        #get transformation params
+        alpha = 1.6732632423543772848170429916717
+        scale = 1.0507009873554804934193349852946
+        alpha_p = -alpha * scale
+        a = ((1 - p) * (1 + p * alpha_p**2))**-0.5
+        b = -a * alpha_p * p
+
+        dtype = x.dtype
+        input_shape = x.shape
+
+        #get mask
+        random_tensor = layers.uniform_random(
+            input_shape, dtype='float32', min=0., max=1.0)
+        p = layers.fill_constant(shape=[1], dtype='float32', value=p)
+        keep_mask = layers.greater_equal(random_tensor, p)
+        keep_mask = layers.cast(keep_mask, dtype)
+        drop_mask = layers.elementwise_sub(
+            layers.fill_constant(
+                shape=input_shape, dtype=dtype, value=1.),
+            keep_mask)
+
+        #apply mask
+        b = layers.fill_constant(shape=[1], dtype=dtype, value=b)
+        y = layers.elementwise_add(
+            paddle.multiply(x, keep_mask),
+            layers.scale(
+                drop_mask, scale=alpha_p))
+        res = layers.elementwise_add(layers.scale(y, scale=a), b, name=name)
+        return res
+    else:  # test
+        return x
+
+
+def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
+    """
+    Pad tensor according to 'pad' and 'mode'.
+    If mode is 'reflect', pad[0] and pad[1] must be no greater
+    than width-1. The height and depth dimension has the same condition.
+
+    Parameters:
+        x (Tensor): The input tensor with data type float32/double/int32/int64_t.
+        pad (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. 1. If input dimension is 3, then the pad has the form (pad_left,
+            pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, 
+            pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form 
+            (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+            
+        mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'.
+            When in 'constant' mode, this op uses a constant value to pad the input tensor.
+            When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor.
+            When in 'replicate' mode, uses input boundaries to pad the input tensor.
+            When in 'circular' mode, uses circular input to pad the input tensor.
+            Default is 'constant'
+        value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0
+        data_format (str): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of
+           the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+                    
+    Returns: a Tensor padded according to pad and mode and data type is same as input.
+    Return Type: Tensor
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[[1., 2., 3.],
+                    [4., 5., 6.]]]]]
+
+            Case 0:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'constant'
+                value = 0
+                Out = [[[[[0. 0. 0. 0. 0. 0. 0.]
+                          [0. 0. 1. 2. 3. 0. 0.]
+                          [0. 0. 4. 5. 6. 0. 0.]
+                          [0. 0. 0. 0. 0. 0. 0.]]]]]
+
+            Case 1:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'reflect'
+                Out = [[[[[6. 5. 4. 5. 6. 5. 4.]
+                          [3. 2. 1. 2. 3. 2. 1.]
+                          [6. 5. 4. 5. 6. 5. 4.]
+                          [3. 2. 1. 2. 3. 2. 1.]]]]]
+
+            Case 2:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'replicate'
+                Out = [[[[[1. 1. 1. 2. 3. 3. 3.]
+                          [1. 1. 1. 2. 3. 3. 3.]
+                          [4. 4. 4. 5. 6. 6. 6.]
+                          [4. 4. 4. 5. 6. 6. 6.]]]]]
+
+            Case 3:
+                pad = [2, 2, 1, 1, 0, 0],
+                mode = 'circular'
+                Out = [[[[[5. 6. 4. 5. 6. 4. 5.]
+                          [2. 3. 1. 2. 3. 1. 2.]
+                          [5. 6. 4. 5. 6. 4. 5.]
+                          [2. 3. 1. 2. 3. 1. 2.]]]]]
+
+    Code Examples:
+        .. code-block:: python
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            
+            paddle.disable_static()
+            
+            # example 1
+            x_shape = (1, 1, 3)
+            x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
+            tensor_x = paddle.to_tensor(x)
+            y = F.pad(tensor_x, pad=[2, 3], value=1, mode='constant')
+            print(y.numpy())
+            # [[[1. 1. 1. 2. 3. 1. 1. 1.]]]
+            
+            # example 2
+            x_shape = (1, 1, 2, 3)
+            x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
+            tensor_x = paddle.to_tensor(x)
+            y = F.pad(tensor_x, pad=[1, 2, 1, 1], value=1, mode='circular')
+            print(y.numpy())
+            # [[[[6. 4. 5. 6. 4. 5.]
+            #    [3. 1. 2. 3. 1. 2.]
+            #    [6. 4. 5. 6. 4. 5.]
+            #    [3. 1. 2. 3. 1. 2.]]]]
+    """
+    assert mode in ['reflect', 'replicate', 'constant', 'circular'], \
+            "mode should be one of constant, reflect, replicate, circular, but got {}.".format(mode)
+
+    data_format = data_format.upper()
+    assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], \
+        "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \
+        "but got {}".format(data_format)
+
+    x_dim = len(x.shape)
+
+    original_data_format = data_format
+    unsqueezed_dim = []
+
+    if isinstance(pad, Variable):
+        if data_format in ["NCL", "NCHW", "NCDHW"]:
+            data_format = "NCDHW"
+            if x_dim == 3:
+                pad = concat([zeros((4, ), dtype="int32"), pad], axis=0)
+                unsqueezed_dim = [3, 4]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = concat([pad, zeros((2, ), dtype="int32")], axis=0)
+                unsqueezed_dim = [2]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+        elif data_format in ["NLC", "NHWC", "NDHWC"]:
+            data_format = "NDHWC"
+            if x_dim == 3:
+                pad = concat([zeros((4, ), dtype="int32"), pad], axis=0)
+                unsqueezed_dim = [2, 3]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = concat([pad, zeros((2, ), dtype="int32")], axis=0)
+                unsqueezed_dim = [1]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+    else:
+        if data_format in ["NCL", "NCHW", "NCDHW"]:
+            data_format = "NCDHW"
+            if x_dim == 3:
+                pad = [0, 0, 0, 0] + pad
+                unsqueezed_dim = [3, 4]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = pad + [0, 0]
+                unsqueezed_dim = [2]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+        elif data_format in ["NLC", "NHWC", "NDHWC"]:
+            data_format = "NDHWC"
+            if x_dim == 3:
+                pad = [0, 0, 0, 0] + pad
+                unsqueezed_dim = [2, 3]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+            elif x_dim == 4:
+                pad = pad + [0, 0]
+                unsqueezed_dim = [1]
+                x = unsqueeze(x, axes=unsqueezed_dim)
+
+    if in_dygraph_mode():
+        if isinstance(pad, Variable):
+            pad = pad.numpy()
+        out = core.ops.pad3d(x, "paddings", pad, "mode", mode, "value", value,
+                             "data_format", data_format, "name", name)
+    else:
+        attrs = {'mode': mode, 'value': value, 'data_format': data_format}
+        inputs = {'X': [x]}
+        if isinstance(pad, Variable):
+            inputs['Paddings'] = [pad]
+            attrs['paddings'] = []
+        else:
+            attrs['paddings'] = pad
+
+        helper = LayerHelper('pad3d', **locals())
+
+        dtype = helper.input_dtype(input_param_name='input')
+        out = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
+
+    if len(unsqueezed_dim) != 0:
+        out = squeeze(out, axes=unsqueezed_dim)
+
+    return out
+
+
+def cosine_similarity(x1, x2, axis=1, eps=1e-8):
+    """
+    Compute cosine similarity between x1 and x2 along axis.
+
+    Parameters:
+        x1 (Tensor): First input. float32/double.
+        x2 (Tensor): Second input. float32/double.
+        axis (int): Dimension of vectors to compute cosine similarity. Default is 1.
+        eps(float): Small value to avoid division by zero. Default is 1e-8.
+                    
+    Returns: a Tensor representing cosine similarity between x1 and x2 along axis.
+    Return Type: Tensor
+
+    Examples:
+        .. code-block:: text
+            Case 0:
+                x1 = [[0.8024077  0.9927354  0.27238318 0.8344984 ]
+                     [0.48949873 0.5797396  0.65444374 0.66510963]
+                     [0.1031398  0.9614342  0.08365563 0.6796464 ]
+                     [0.10760343 0.7461209  0.7726148  0.5801006 ]]
+                x2 = [[0.62913156 0.1536727  0.9847992  0.04591406]
+                     [0.9098952  0.15715368 0.8671125  0.3156102 ]
+                     [0.4427798  0.54136837 0.5276275  0.32394758]
+                     [0.3769419  0.8535014  0.48041078 0.9256797 ]]
+                axis = 1
+                eps = 1e-8
+                Out: [0.5275037  0.8368967  0.75037485 0.9245899]
+
+    Code Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            np.random.seed(0)
+            x1 = np.random.rand(2,3)
+            x2 = np.random.rand(2,3)
+            x1 = paddle.to_tensor(x1)
+            x2 = paddle.to_tensor(x2)
+            result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0)
+            print(result.numpy())
+            # [0.99806249 0.9817672  0.94987036]
+            
+    """
+    w12 = sum(elementwise_mul(x1, x2), axis=axis)
+    w1 = sum(elementwise_mul(x1, x1), axis=axis)
+    w2 = sum(elementwise_mul(x2, x2), axis=axis)
+    n12 = sqrt(clip(w1 * w2, min=eps * eps))
+    cos_sim = w12 / n12
+    return cos_sim
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 2a519718258856fe1f4462422a36dccae7066ad1..f80f200c7163836252faa4b1c932178f6bab0dff 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -13,15 +13,24 @@
 # limitations under the License.
 from __future__ import print_function
 
-__all__ = ['conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose']
+__all__ = [
+    'conv1d',
+    'conv_transpose1d',
+    'conv2d',
+    'conv_transpose2d',
+    'conv3d',
+    'conv_transpose3d',
+]
 
 import numpy as np
+from ...device import get_cudnn_version
 from ...fluid.framework import Variable, in_dygraph_mode
 from ...fluid import core, dygraph_utils
 from ...fluid.layers import nn, utils
 from ...fluid.data_feeder import check_variable_and_dtype
 from ...fluid.param_attr import ParamAttr
 from ...fluid.layer_helper import LayerHelper
+from .common import pad2d
 
 
 def _is_list_or_tuple(input):
@@ -87,20 +96,242 @@ def _update_padding_nd(padding, channel_last, num_dims):
     return padding, padding_algorithm
 
 
-def conv2d(input,
+def conv1d(x,
            weight,
            bias=None,
+           stride=1,
            padding=0,
+           dilation=1,
+           groups=1,
+           data_format='NCL',
+           name=None):
+    """
+    The convolution1D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCL format, where N is batch size, C is the number of
+    channels, L is the length of the feature.
+    Filter is in MCK format, where M is the number of output image channels,
+    C is the number of input image channels, K is the size of the kernel.
+    If the groups is greater than 1, C will equal the number of input image
+    channels divided by the groups. If bias attribution and activation type
+    are provided, bias is added to the output of the convolution, and the
+    corresponding activation function is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a tensor with NCL format.
+    * :math:`W`: Kernel value, a tensor with MCK format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, L_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, L_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, L_{out})`
+
+        Where
+
+        .. math::
+
+            L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1
+
+    Args:
+        x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type 
+            of input is float16 or float32 or float64.
+        weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is
+            the number of output channels, g is the number of groups, K is the kernel's size. 
+        bias (Tensor, optional): The bias with shape [M,]. Default: None.
+        stride (int or tuple, optional): The stride size. If stride is a tuple, it must
+            contain one integers, (stride_size). Default: 1.
+        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
+            1. a string in ['valid', 'same'].
+            2. an int, which means the feature map is zero paded by size of `padding` on both sides.
+            3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides.
+            4. a list[int] or tuple[int] whose length is 2. It has the form  [pad_before, pad_after].
+            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
+            The default value is 0.
+        dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+            contain one integer, (dilation_size). Default: 1.
+        groups (int, optional): The groups number of the conv1d function. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: 1.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output 
+            will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
+            The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
+            `[batch_size, input_channels, feature_length]`.
+        name(str, optional): For detailed information, please refer 
+           to :ref:`api_guide_Name`. Usually name is no need to set and 
+           None by default.
+
+    Returns:
+        A tensor representing the conv1d, whose data type is the 
+        same with input.
+
+    Raises:
+        ValueError: If the channel dimmention of the input is less than or equal to zero.
+        ValueError: If `data_format` is not "NCL" or "NLC".
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+            or the element corresponding to the input's channel is not 0.
+        ShapeError: If the input is not 3-D Tensor.
+        ShapeError: If the input's dimension size and filter's dimension size not equal.
+        ShapeError: If the dimension size of input minus the size of `stride` is not 1.
+        ShapeError: If the number of input channels is not equal to filter's channels * groups.
+        ShapeError: If the number of output channels is not be divided by groups.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          x = np.array([[[4, 8, 1, 9],
+            [7, 2, 0, 9],
+            [6, 9, 2, 6]]]).astype(np.float32)
+          w=np.array(
+          [[[9, 3, 4],
+            [0, 0, 7],
+            [2, 5, 6]],
+           [[0, 3, 4],
+            [2, 9, 7],
+            [5, 6, 8]]]).astype(np.float32)
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv1d(x_var, w_var)
+          y_np = y_var.numpy()
+          print(y_np)
+          
+          # [[[133. 238.]
+          #   [160. 211.]]]
+    """
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if data_format not in ["NCL", "NLC"]:
+        raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. "
+                         "Received Attr(data_format): {}.".format(data_format))
+
+    channel_last = (data_format == "NHWC")
+    channel_dim = -1 if channel_last else 1
+    conv2d_data_format = "NHWC" if channel_last else "NCHW"
+    num_channels = x.shape[channel_dim]
+    num_filters = weight.shape[0]
+    if num_channels < 0:
+        raise ValueError("The channel dimmention of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             x.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, x.shape, groups))
+    if num_filters % groups != 0:
+        raise ValueError(
+            "the number of filters must be divisible by groups,"
+            "received: the number of filters is {}, the shape of weight is {}"
+            ", the groups is {}".format(num_filters, weight.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+    if len(padding) == 2:
+        padding = padding + [0] * 2
+    elif len(padding) == 1:
+        padding = padding + [0]
+    else:
+        raise ValueError(
+            "The size of padding's dimmention should 1 or 2. But got padding={}".
+            format(padding))
+
+    stride = utils.convert_to_list(stride, 1, 'stride') + [1]
+    dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1]
+
+    l_type = "conv2d"
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
+        l_type = 'depthwise_conv2d'
+        use_cudnn = False
+
+    inputs = {'Input': [x], 'Filter': [weight]}
+    attrs = {
+        'strides': stride,
+        'paddings': padding,
+        'dilations': dilation,
+        'groups': groups,
+        'use_cudnn': use_cudnn,
+        'use_mkldnn': False,
+        'fuse_relu_before_depthwise_conv': False,
+        "padding_algorithm": padding_algorithm,
+        "data_format": conv2d_data_format
+    }
+    squeeze_aixs = -2 if channel_last else -1
+    x = nn.unsqueeze(input=x, axes=[squeeze_aixs])
+    weight = nn.unsqueeze(input=weight, axes=[-1])
+    if in_dygraph_mode():
+        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
+                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
+                 'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
+                 padding_algorithm, "data_format", conv2d_data_format)
+        out = getattr(core.ops, l_type)(x, weight, *attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+    else:
+        inputs = {'Input': [x], 'Filter': [weight]}
+        attrs = {
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': False,
+            'fuse_relu_before_depthwise_conv': False,
+            "padding_algorithm": padding_algorithm,
+            "data_format": conv2d_data_format
+        }
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'conv2d')
+        helper = LayerHelper(l_type, **locals())
+        dtype = helper.input_dtype()
+        out = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [out]}
+        helper.append_op(
+            type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+    out = nn.squeeze(input=out, axes=[squeeze_aixs])
+    return out
+
+
+def conv2d(x,
+           weight,
+           bias=None,
            stride=1,
+           padding=0,
            dilation=1,
            groups=1,
-           use_cudnn=True,
-           act=None,
            data_format="NCHW",
            name=None):
     """
-	:alias_main: paddle.nn.functional.conv2d
-	:alias: paddle.nn.functional.conv2d,paddle.nn.functional.conv.conv2d
 
     The convolution2D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input and
@@ -152,12 +383,15 @@ def conv2d(input,
             W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
-        input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type 
+        x (Tensor): The input is 4-D Tensor with shape [N, C, H, W], the data type 
             of input is float16 or float32 or float64.
-        weight (Variable): The convolution kernel with shape [M, C/g, kH, kW], where M is
+        weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is
             the number of output channels, g is the number of groups, kH is the filter's
             height, kW is the filter's width. 
-        bias (Variable, optional): The bias with shape [M,].
+        bias (Tensor, optional): The bias with shape [M,].
+        stride (int|tuple): The stride size. It means the stride in convolution. 
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
             on both sides for each dimension.If `padding` is a string, either 'VALID' or
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
@@ -168,9 +402,6 @@ def conv2d(input,
             when `data_format` is `"NHWC"`, `pool_padding` can be in the form
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride (int|tuple): The stride size. It means the stride in convolution. 
-            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
-            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel
             points. If dilation is a tuple, it must contain two integers, (dilation_height, 
             dilation_width). Otherwise, dilation_height = dilation_width = dilation. 
@@ -180,10 +411,6 @@ def conv2d(input,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: groups=1.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
@@ -193,13 +420,9 @@ def conv2d(input,
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv2d, whose data type is the 
-        same with input. If act is None, the tensor variable storing the convolution 
-        result, and if act is not None, the tensor variable storing convolution 
-        and non-linearity activation result.
+        A Tensor representing the conv2d result, whose data type is the same with input. 
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCHW" or "NHWC".
         ValueError: If the channel dimmention of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
@@ -214,62 +437,65 @@ def conv2d(input,
     Examples:
         .. code-block:: python
 
-          from paddle import fluid
+          import paddle
           import paddle.nn.functional as F
-          import paddle.fluid.dygraph as dg
           import numpy as np
 
           x = np.random.randn(2, 3, 8, 8).astype(np.float32)
           w = np.random.randn(6, 3, 3, 3).astype(np.float32)
 
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              w_var = dg.to_variable(w)
-              y_var = F.conv2d(x_var, w_var, act="relu")
-              y_np = y_var.numpy()
+          paddle.disable_static()
+
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv2d(x_var, w_var)
+          y_np = y_var.numpy()
+
           print(y_np.shape)
 
           # (2, 6, 6, 6)
     """
     # entry checks
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
                          "Received Attr(data_format): {}.".format(data_format))
 
     channel_last = (data_format == "NHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError("The channel dimmention of the input({}) "
                          "should be defined. Received: {}.".format(
-                             input.shape, num_channels))
+                             x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, input.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups))
     if num_filters % groups != 0:
         raise ValueError(
             "the number of filters must be divisible by groups,"
             "received: the number of filters is {}, the shape of weight is {}"
             ", the groups is {}".format(num_filters, weight.shape, groups))
 
+    # use_cudnn = True if core.is_compiled_with_cuda() else False
+    cudnn_version = get_cudnn_version()
+
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
+
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
     stride = utils.convert_to_list(stride, 2, 'stride')
     dilation = utils.convert_to_list(dilation, 2, 'dilation')
 
     l_type = "conv2d"
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            not use_cudnn):
+    if (num_channels == groups and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
+        use_cudnn = False
 
-    inputs = {'Input': [input], 'Filter': [weight]}
+    inputs = {'Input': [x], 'Filter': [weight]}
     attrs = {
         'strides': stride,
         'paddings': padding,
@@ -287,15 +513,13 @@ def conv2d(input,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
                  'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
                  padding_algorithm, "data_format", data_format)
-        pre_bias = getattr(core.ops, l_type)(input, weight, *attrs)
+        pre_bias = getattr(core.ops, l_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
             'strides': stride,
             'paddings': padding,
@@ -307,8 +531,8 @@ def conv2d(input,
             "padding_algorithm": padding_algorithm,
             "data_format": data_format
         }
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv2d')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'conv2d')
         helper = LayerHelper(l_type, **locals())
         dtype = helper.input_dtype()
         pre_bias = helper.create_variable_for_type_inference(dtype)
@@ -316,28 +540,279 @@ def conv2d(input,
         helper.append_op(
             type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
+
     return out
 
 
-def conv2d_transpose(input,
+def conv_transpose1d(x,
                      weight,
                      bias=None,
-                     output_size=None,
-                     padding=0,
                      stride=1,
+                     padding=0,
+                     output_padding=0,
+                     groups=1,
                      dilation=1,
+                     output_size=None,
+                     data_format="NCL",
+                     name=None):
+    """
+    The 1-D convolution transpose layer calculates the output based on the input,
+    filter, and dilation, stride, padding. Input(Input) and output(Output)
+    are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels,
+    L is the length of the feature. The details of convolution transpose
+    layer, please refer to the following explanation and references
+    `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a 3-D Tensor with 'NCL' format or 'NLC' format.
+    * :math:`W`: Filter value, a 3-D Tensor with 'MCK' format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, a 3-D Tensor with data format 'NCL' or 'NLC', the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, L_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, L_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, L_{out})`
+
+        Where
+
+        .. math::
+
+           L^\prime_{out} &= (L_{in} - 1) * stride - pad_top - pad_bottom + dilation * (L_f - 1) + 1 + output_padding \\\\
+           L_{out} &\in [ L^\prime_{out}, L^\prime_{out} + stride ]
+
+    Note:
+          The conv1d_transpose can be seen as the backward of the conv1d. For conv1d,
+          when stride > 1, conv1d maps multiple input shape to the same output shape,
+          so for conv1d_transpose, when stride > 1, input shape maps multiple output shape.
+          If output_size is None, :math:`L_{out} = L^\prime_{out}`;
+          else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}`
+          and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically.
+
+    Args:
+        x(Tensor): 3-D tensor with [N, C, L] or [N, L, C] format,
+                         its data type is float32 or float64.
+        weight(Tensor): The convolution kernel, a Tensor with shape [C, M/g, K],
+            where M is the number of output channels(filters), g is the number of groups,
+            K is the size of the kernel.
+        bias(Tensor, optional): The bias, a Tensor with shape [M, ].
+        stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution.
+            If stride is a tuple, it must contain one integer, `(stride_size)`.
+            Default: stride = 1.
+        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
+             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
+             string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
+             If `padding` is a tuple or list, it could be in two forms:
+             `[pad]` or `[pad_left, pad_right]`. Default: padding = 0.
+        output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension.
+             If it is a tuple, it must contain one integer. Default: 0.
+        groups(int, optional): The groups number of the conv1d transpose function. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups = 1.
+        dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points.
+            If dilation is a tuple, it must contain one integer, `(dilation_size)`.
+            Default: dilation = 1.
+        output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain one integer, `(feature_length)`. None if use
+            filter_size, padding, and stride to calculate output_size.
+            If output_size and filter_size are specified at the same time, They
+            should follow the formula above. Default: None. output_size and filter_size
+            should not be None at the same time.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output 
+            will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
+            The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_length]`.
+        name(str, optional): For detailed information, please refer 
+           to :ref:`api_guide_Name`. Usually name is no need to set and 
+           None by default.
+
+    Returns:
+        A  tensor representing the result of 1-D transpose convolution, whose
+        data type is the same with input. And its shape is (num_batches, channels, length)
+        when data_format is `"NCL"` and (num_batches, length, channels) when data_format is
+        `"NLC"`.
+
+    Raises:
+        ValueError: If `data_format` is a string, but not "NCL" or "NLC".
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+            or the element corresponding to the input's channel is not 0.
+        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_padding` is greater than `stride`.
+        ShapeError: If the input is not 3-D Tensor.
+        ShapeError: If the input's dimension size and filter's dimension size not equal.
+        ShapeError: If the dimension size of input minus the size of `stride` is not 1.
+        ShapeError: If the number of input channels is not equal to filter's channels.
+        ShapeError: If the size of `output_size` is not equal to that of `stride`.
+
+    Examples:
+        .. code-block:: python
+
+
+
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          
+          paddle.disable_static()
+          # shape: (1, 2, 4)
+          x=np.array([[[4, 0, 9, 7],
+                       [8, 0, 9, 2,]]]).astype(np.float32)
+          # shape: (2, 1, 2)
+          y=np.array([[[7, 0]],
+                      [[4, 2]]]).astype(np.float32)
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv_transpose1d(x_var, w_var)
+          y_np = y_var.numpy()
+          print y_np
+          
+          # [[[60. 16. 99. 75.  4.]]]
+    """
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if data_format not in ['NCL', 'NLC']:
+        raise ValueError(
+            "Attr(data_format) of conv2d_transpose got wrong value: "
+            "received {}, but only 'NCL' or 'NLC' are supported.".format(
+                data_format))
+    channel_last = (data_format == "NLC")
+    channel_dim = -1 if channel_last else 1
+
+    num_channels = x.shape[channel_dim]
+    if num_channels < 0:
+        raise ValueError("The channel dimmention of the input({}) "
+                         "should be defined. Received: {}.".format(
+                             x.shape, num_channels))
+    if num_channels % groups != 0:
+        raise ValueError(
+            "the channel of input must be divisible by groups,"
+            "received: the channel of input is {}, the shape of input is {}"
+            ", the groups is {}".format(num_channels, x.shape, groups))
+
+    # update attrs
+    padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+
+    if len(padding) == 2:
+        padding = padding + [0] * 2
+    elif len(padding) == 1:
+        padding = padding + [0]
+    else:
+        raise ValueError(
+            "The size of padding's dimmention should 1 or 2. But got padding={}".
+            format(padding))
+
+    stride = utils.convert_to_list(stride, 1, 'stride') + [1]
+    dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1]
+    output_padding = utils.convert_to_list(output_padding, 1,
+                                           'output_padding') + [0]
+    if output_padding[0] > stride[0]:
+        raise ValueError(
+            "The size of output_padding should not be greater than stride."
+            "But got output_padding={} and stride={}".format(output_padding[0],
+                                                             stride[0]))
+
+    if output_size is None:
+        output_size = []
+    elif isinstance(output_size, (list, tuple, int)):
+        output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1]
+    else:
+        raise ValueError("output_size should be int, or list, tuple of ints")
+
+    op_type = 'conv2d_transpose'
+    num_filters = weight.shape[1]
+    if (num_channels == groups and num_filters == 1 and not use_cudnn):
+        op_type = 'depthwise_conv2d_transpose'
+        use_cudnn = False
+
+    squeeze_axis = -2 if channel_last else -1
+    conv2d_data_format = "NHWC" if channel_last else "NCHW"
+
+    x = nn.unsqueeze(input=x, axes=[squeeze_axis])
+    weight = nn.unsqueeze(input=weight, axes=[-1])
+
+    if in_dygraph_mode():
+        attrs = ('output_size', output_size, 'strides', stride, 'paddings',
+                 padding, 'padding_algorithm', padding_algorithm, 'dilations',
+                 dilation, 'groups', groups, 'use_cudnn', use_cudnn,
+                 'data_format', conv2d_data_format)
+        out = getattr(core.ops, op_type)(x, weight, *attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+    else:
+        inputs = {'Input': [x], 'Filter': [weight]}
+        attrs = {
+            'output_size': output_size,
+            'strides': stride,
+            'paddings': padding,
+            'padding_algorithm': padding_algorithm,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'data_format': conv2d_data_format
+        }
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'conv2d_transpose')
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype()
+        out = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Output": [out]}
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        if bias is not None:
+            out = nn.elementwise_add(out, bias, axis=channel_dim)
+
+    if output_size is None:
+        out = pad2d(
+            out,
+            padding=[0, output_padding, 0, 0],
+            data_format=conv2d_data_format,
+            name=name)
+    out = nn.squeeze(input=out, axes=[squeeze_axis])
+    return out
+
+
+def conv_transpose2d(x,
+                     weight,
+                     bias=None,
+                     stride=1,
+                     padding=0,
+                     output_padding=0,
                      groups=1,
-                     use_cudnn=True,
-                     act=None,
+                     dilation=1,
                      data_format='NCHW',
+                     output_size=None,
                      name=None):
     """
-	:alias_main: paddle.nn.functional.conv2d_transpose
-	:alias: paddle.nn.functional.conv2d_transpose,paddle.nn.functional.conv.conv2d_transpose
 
     The convolution2D transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
@@ -350,6 +825,7 @@ def conv2d_transpose(input,
     If bias attribution and activation type are provided, bias is added to
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
+    See more detail in :ref:`api_nn_conv_ConvTranspose2d` .
 
     For each input :math:`X`, the equation is:
 
@@ -398,18 +874,15 @@ def conv2d_transpose(input,
           conv2d_transpose can compute the kernel size automatically.
 
     Args:
-        input(Variable): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
+        x(Tensor): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
             whose data type is float32 or float64.
-        weight(Variable): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
+        weight(Tensor): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
             where M is the number of output channels(filters), g is the number of groups,
             kH is the height of the kernel, and kW is the width of the kernel.
-        bias(Variable, optional): The bias, a Tensor with shape [M, ].
-        output_size(int|tuple|list, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_height, image_width). None if use
-            filter_size, padding, and stride to calculate output_size.
-            If output_size is specified, output_size and filter_size (weight)'s shape 
-            should follow the formula above. Default: None. output_size and filter_size 
-            should not be None at the same time.
+        bias(Tensor, optional): The bias, a Tensor with shape [M, ].
+        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
              `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
              string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
@@ -421,10 +894,9 @@ def conv2d_transpose(input,
             when `data_format` is `'NHWC'`, `padding` can be in the form
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
-            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
-        dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. 
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
+        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
             If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). 
             Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
         groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
@@ -433,10 +905,12 @@ def conv2d_transpose(input,
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups = 1.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
+        output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain two integers, (image_height, image_width). None if use
+            filter_size, padding, and stride to calculate output_size.
+            If output_size is specified, output_size and filter_size (weight)'s shape 
+            should follow the formula above. Default: None. output_size and filter_size 
+            should not be None at the same time.
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
@@ -446,20 +920,17 @@ def conv2d_transpose(input,
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv2d_transpose, whose 
+        A Tensor representing the conv_transpose2d, whose 
         data type is the same with input and shape is (num_batches, channels, out_h, 
-        out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable 
-        storing the transposed convolution result, and if act is not None, the 
-        tensor variable storing transposed convolution and non-linearity activation 
-        result.
+        out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing 
+        transposed convolution result.
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCHW" or "NHWC".
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
             or the element corresponding to the input's channel is not 0.
-        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_size` and kernel_size are None at the same time.
         ShapeError: If the input is not 4-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
         ShapeError: If the dimension size of input minus the size of `stride` is not 2.
@@ -469,28 +940,23 @@ def conv2d_transpose(input,
     Examples:
         .. code-block:: python
 
-          from paddle import fluid
-          import paddle.nn.functional as F
-          import paddle.fluid.dygraph as dg
           import numpy as np
+          import paddle
+          import paddle.nn.functional as F
 
           x = np.random.randn(2, 3, 8, 8).astype(np.float32)
           w = np.random.randn(3, 6, 3, 3).astype(np.float32)
 
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              w_var = dg.to_variable(w)
-              y_var = F.conv2d_transpose(x_var, w_var, act="relu")
-              y_np = y_var.numpy()
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv2d_transpose(x_var, w_var)
+          y_np = y_var.numpy()
           print(y_np.shape)
 
           # (2, 6, 10, 10)
     """
 
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
     if data_format not in ['NCHW', 'NHWC']:
         raise ValueError(
             "Attr(data_format) of conv2d_transpose got wrong value: "
@@ -498,48 +964,65 @@ def conv2d_transpose(input,
                 data_format))
     channel_last = (data_format == "NHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     if num_channels < 0:
         raise ValueError("The channel dimmention of the input({}) "
                          "should be defined. Received: {}.".format(
-                             input.shape, num_channels))
+                             x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, input.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups))
+
+    cudnn_version = get_cudnn_version()
+
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
     stride = utils.convert_to_list(stride, 2, 'stride')
     dilation = utils.convert_to_list(dilation, 2, 'dilation')
+
     if output_size is None:
         output_size = []
-    elif isinstance(output_size, (list, tuple, int)):
-        output_size = utils.convert_to_list(output_size, 2, 'output_size')
     else:
-        raise ValueError("output_size should be int, or list, tuple of ints")
+        if output_padding != 0:
+            raise ValueError('output_padding option is mutually exclusive with '
+                             'output_size')
+        if isinstance(output_size, (list, tuple, int)):
+            output_size = utils.convert_to_list(output_size, 2, 'output_size')
+        else:
+            raise ValueError(
+                "output_size should be int, or list, tuple of ints")
+
+    if output_padding == 0:
+        output_padding = []
+    else:
+        output_padding = utils.convert_to_list(output_padding, 2,
+                                               'output_padding')
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_filters == 1 and not use_cudnn):
+    if (num_channels == groups and num_filters == 1):
         op_type = 'depthwise_conv2d_transpose'
+        use_cudnn = False
 
     if in_dygraph_mode():
-        attrs = ('output_size', output_size, 'strides', stride, 'paddings',
-                 padding, 'padding_algorithm', padding_algorithm, 'dilations',
-                 dilation, 'groups', groups, 'use_cudnn', use_cudnn,
-                 'data_format', data_format)
-        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        attrs = ('output_padding', output_padding, 'output_size', output_size,
+                 'strides', stride, 'paddings', padding, 'padding_algorithm',
+                 padding_algorithm, 'dilations', dilation, 'groups', groups,
+                 'use_cudnn', use_cudnn, 'data_format', data_format)
+        pre_bias = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
+            'output_padding': output_padding,
             'output_size': output_size,
             'strides': stride,
             'paddings': padding,
@@ -549,37 +1032,32 @@ def conv2d_transpose(input,
             'use_cudnn': use_cudnn,
             'data_format': data_format
         }
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'],
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                                  'conv2d_transpose')
         helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype()
-        pre_bias = helper.create_variable_for_type_inference(dtype)
+        pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
+
     return out
 
 
-def conv3d(input,
+def conv3d(x,
            weight,
            bias=None,
-           padding=0,
            stride=1,
+           padding=0,
            dilation=1,
            groups=1,
-           use_cudnn=True,
-           act=None,
            data_format="NCDHW",
            name=None):
     """
-	:alias_main: paddle.nn.functional.conv3d
-	:alias: paddle.nn.functional.conv3d,paddle.nn.functional.conv.conv3d
 
     The convolution3D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input(Input) and
@@ -625,12 +1103,15 @@ def conv3d(input,
             W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
 
     Args:
-        input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data 
+        x (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data 
             type of input is float16 or float32 or float64.
         weight (Variable): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW],
             where M is the number of filters(output channels), g is the number of groups,
             kD, kH, kW are the filter's depth, height and width respectively.
-        bias (Variable, optional): The bias, a Tensor of shape [M, ].
+        bias (Tensor, optional): The bias, a Tensor of shape [M, ].
+        stride (int|tuple): The stride size. It means the stride in convolution. If stride is a 
+            tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
+            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
         padding (string|int|list|tuple): The padding size. It means the number of zero-paddings 
             on both sides for each dimension. If `padding` is a string, either 'VALID' or
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
@@ -641,9 +1122,6 @@ def conv3d(input,
             when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride (int|tuple): The stride size. It means the stride in convolution. If stride is a 
-            tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
-            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel points. 
             If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
             dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
@@ -653,10 +1131,6 @@ def conv3d(input,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: groups=1
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
@@ -666,13 +1140,12 @@ def conv3d(input,
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv3d, whose data type is 
+        A Tensor representing the conv3d, whose data type is 
         the same with input. If act is None, the tensor variable storing the 
         convolution result, and if act is not None, the tensor variable storing 
         convolution and non-linearity activation result.
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
         ValueError: If the channel dimmention of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
@@ -706,10 +1179,6 @@ def conv3d(input,
             # (2, 6, 6, 6, 6)
     """
     # entry check
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. Received "
-                         "Attr(use_cudnn): {}. ".format(use_cudnn))
-
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
@@ -717,12 +1186,12 @@ def conv3d(input,
 
     channel_last = (data_format == "NDHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
             "The channel dimmention of the input({}) should be defined. "
-            "Received: {}.".format(input.shape, num_channels))
+            "Received: {}.".format(x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
@@ -734,6 +1203,10 @@ def conv3d(input,
             "Received: number of filters({}), groups({}).".format(num_filters,
                                                                   groups))
 
+    cudnn_version = get_cudnn_version()
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
+
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
     stride = utils.convert_to_list(stride, 3, 'stride')
     dilation = utils.convert_to_list(dilation, 3, 'dilation')
@@ -744,15 +1217,13 @@ def conv3d(input,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
                  "padding_algorithm", padding_algorithm, "data_format",
                  data_format)
-        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        pre_bias = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
             'strides': stride,
             'paddings': padding,
@@ -765,8 +1236,8 @@ def conv3d(input,
         }
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype()
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv3d')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'conv3d')
 
         pre_bias = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [pre_bias]}
@@ -774,31 +1245,26 @@ def conv3d(input,
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
 
     return out
 
 
-def conv3d_transpose(input,
+def conv_transpose3d(x,
                      weight,
                      bias=None,
-                     output_size=None,
-                     padding=0,
                      stride=1,
-                     dilation=1,
+                     padding=0,
+                     output_padding=0,
                      groups=1,
-                     use_cudnn=True,
-                     act=None,
+                     dilation=1,
                      data_format='NCDHW',
+                     output_size=None,
                      name=None):
     """
-	:alias_main: paddle.nn.functional.conv3d_transpose
-	:alias: paddle.nn.functional.conv3d_transpose,paddle.nn.functional.conv.conv3d_transpose
-
-    The convolution3D transpose layer calculates the output based on the input,
+    The convolution3d transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
     are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels,
     D is the depth of the feature, H is the height of the feature, and W
@@ -809,6 +1275,7 @@ def conv3d_transpose(input,
     If bias attribution and activation type are provided, bias is added to
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
+    See more detail in :ref:`api_nn_conv_ConvTranspose3d` .
 
     For each input :math:`X`, the equation is:
 
@@ -861,17 +1328,16 @@ def conv3d_transpose(input,
           conv3d_transpose can compute the kernel size automatically.
 
     Args:
-        input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type 
+        x(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type 
             of input is float32 or float64.
-        weight (Variable): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
+        weight (Tensor): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
             where M is the number of filters(output channels), g is the number of groups,
             kD, kH, kW are the filter's depth, height and width respectively.
-        bias (Variable, optional): The bias, a Tensor of shape [M, ].
-        output_size(int|tuple, optional): The output image size. If output size is a
-            tuple, it must contain three integers, (image_depth, image_height, image_width). This
-            parameter only works when filter_size is None. If output_size and filter_size are 
-            specified at the same time, They should follow the formula above. Default: None. 
-            Output_size and filter_size should not be None at the same time.
+        bias (Tensor, optional): The bias, a Tensor of shape [M, ].
+        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
+            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
+            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
+            Default: stride = 1.
         padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
              adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
              either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
@@ -882,11 +1348,9 @@ def conv3d_transpose(input,
             when `data_format` is `'NDHWC'`, `padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
-            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
-            Default: stride = 1.
-        dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. 
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
+        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
             If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, 
             dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
             Default: dilation = 1.
@@ -896,32 +1360,32 @@ def conv3d_transpose(input,
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups=1
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Specify the data format of the input, and the data format of the output 
             will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
             `[batch_size, input_channels, input_height, input_width]`.
+        output_size(int|list|tuple, optional): The output image size. If output size is a
+            tuple, it must contain three integers, (image_depth, image_height, image_width). This
+            parameter only works when filter_size is None. If output_size and filter_size are 
+            specified at the same time, They should follow the formula above. Default: None. 
+            Output_size and filter_size should not be None at the same time.
         name(str, optional): For detailed information, please refer 
            to :ref:`api_guide_Name`. Usually name is no need to set and 
            None by default.
 
     Returns:
-        A Variable holding Tensor representing the conv3d_transpose, whose data 
+        A Tensor representing the conv_transpose3d, whose data 
         type is the same with input and shape is (num_batches, channels, out_d, out_h, 
         out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor 
         variable storing the transposed convolution result, and if act is not None, the tensor 
         variable storing transposed convolution and non-linearity activation result.
 
     Raises:
-        ValueError: If the type of `use_cudnn` is not bool.
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
             or the element corresponding to the input's channel is not 0.
-        ValueError: If `output_size` and filter_size are None at the same time.
+        ValueError: If `output_size` and kernel_size are None at the same time.
         ShapeError: If the input is not 5-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
         ShapeError: If the dimension size of input minus the size of `stride` is not 2.
@@ -930,29 +1394,26 @@ def conv3d_transpose(input,
 
     Examples:
        .. code-block:: python
+          
+          import numpy as np
 
-          from paddle import fluid
+          import paddle
           import paddle.nn.functional as F
-          import paddle.fluid.dygraph as dg
-          import numpy as np
 
           x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
           w = np.random.randn(3, 6, 3, 3, 3).astype(np.float32)
 
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              w_var = dg.to_variable(w)
-              y_var = F.conv3d_transpose(x_var, w_var, act="relu")
-              y_np = y_var.numpy()
+          paddle.disable_static()
+
+          x_var = paddle.to_tensor(x)
+          w_var = paddle.to_tensor(w)
+          y_var = F.conv_transpose3d(x_var, w_var)
+          y_np = y_var.numpy()
           print(y_np.shape)
 
           # (2, 6, 10, 10, 10)
     """
     # entry checks
-    if not isinstance(use_cudnn, bool):
-        raise ValueError("Attr(use_cudnn) should be True or False. "
-                         "Received Attr(use_cudnn): {}.".format(use_cudnn))
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
@@ -960,12 +1421,12 @@ def conv3d_transpose(input,
 
     channel_last = (data_format == "NDHWC")
     channel_dim = -1 if channel_last else 1
-    num_channels = input.shape[channel_dim]
+    num_channels = x.shape[channel_dim]
     num_filters = weight.shape[1]
     if num_channels < 0:
         raise ValueError(
             "The channel dimmention of the input({}) should be defined. "
-            "Received: {}.".format(input.shape, num_channels))
+            "Received: {}.".format(x.shape, num_channels))
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
@@ -977,29 +1438,45 @@ def conv3d_transpose(input,
     dilation = utils.convert_to_list(dilation, 3, 'dilation')
     if output_size is None:
         output_size = []
-    elif isinstance(output_size, (list, tuple, int)):
-        output_size = utils.convert_to_list(output_size, 3, 'output_size')
     else:
-        raise ValueError("output_size should be int, or list, tuple of ints")
+        if output_padding != 0:
+            raise ValueError('output_padding option is mutually exclusive with '
+                             'output_size')
+        if isinstance(output_size, (list, tuple, int)):
+            output_size = utils.convert_to_list(output_size, 3, 'output_size')
+        else:
+            raise ValueError(
+                "output_size should be int, or list, tuple of ints")
+
+    if output_padding == 0:
+        output_padding = []
+    else:
+        output_padding = utils.convert_to_list(output_padding, 3,
+                                               'output_padding')
+
+    cudnn_version = get_cudnn_version()
+
+    #TODO(LielinJiang): whether to use cudnn according to the version of cudnn
+    use_cudnn = True if (core.is_compiled_with_cuda() and
+                         cudnn_version is not None) else False
 
     op_type = 'conv3d_transpose'
     data_format_ = "NHWC" if channel_last else "NCHW"
 
     if in_dygraph_mode():
-        attrs = ('output_size', output_size, 'paddings', padding,
-                 "padding_algorithm", padding_algorithm, 'strides', stride,
-                 'dilations', dilation, 'groups', groups, 'use_cudnn',
-                 use_cudnn, "data_format", data_format_)
-        pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
+        attrs = ('output_padding', output_padding, 'output_size', output_size,
+                 'paddings', padding, "padding_algorithm", padding_algorithm,
+                 'strides', stride, 'dilations', dilation, 'groups', groups,
+                 'use_cudnn', use_cudnn, "data_format", data_format_)
+        pre_bias = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = dygraph_utils._append_activation_in_dygraph(
-            pre_act, act, use_cudnn=use_cudnn)
+            out = pre_bias
     else:
-        inputs = {'Input': [input], 'Filter': [weight]}
+        inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
+            'output_padding': output_padding,
             'output_size': output_size,
             'paddings': padding,
             "padding_algorithm": padding_algorithm,
@@ -1010,19 +1487,17 @@ def conv3d_transpose(input,
             "data_format": data_format_
         }
         helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype()
-        check_variable_and_dtype(input, 'input',
-                                 ['float16', 'float32', 'float64'], 'conv3d')
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'conv3d')
 
-        pre_bias = helper.create_variable_for_type_inference(dtype)
+        pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
 
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
         if bias is not None:
-            pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
+            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
-            pre_act = pre_bias
-        out = helper.append_activation(pre_act)
+            out = pre_bias
 
     return out
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
new file mode 100644
index 0000000000000000000000000000000000000000..e77bf0e39672984f7076938b134f3e54f4c761ab
--- /dev/null
+++ b/python/paddle/nn/functional/input.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import warnings
+from ...fluid.framework import Variable, in_dygraph_mode
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.layers import core
+from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+
+__all__ = ['one_hot']
+
+
+def one_hot(x, num_classes, name=None):
+    """
+
+    The operator converts each id in the input 'x' to an one-hot vector with a
+    num_classes length. The value in the vector dimension corresponding to the id
+    is 1, and the value in the remaining dimension is 0.
+
+    The shape of output Tensor is generated by appending num_classes dimension
+    behind the last dimension of the 'x' shape.
+
+    .. code-block:: text
+
+        Example 1:
+
+        input:
+            x.shape = [4]
+            x.data = [1, 1, 3, 0]
+            num_classes = 4
+
+        output:
+            Out.shape = [4, 4]
+            Out.data = [[0., 1., 0., 0.],
+                        [0., 1., 0., 0.],
+                        [0., 0., 0., 1.],
+                        [1., 0., 0., 0.]]
+
+        Example 2:
+
+        input:
+            x.shape = [4]
+            x.data = [1, 1, 5, 0]
+            num_classes = 4
+
+        output: Throw an exception for Illegal value
+            The second dimension in X is 5, which is greater than num_classes,
+            so it throws an exception.
+
+
+    Args:
+        x(Tensor): Tensor with shape :math:`[N_1, N_2, ..., N_k]` ,
+            which contains at least one dimension. The data type is int32 or int64.
+        num_classes(int): An integer defining the num_classes of the one hot dimension. If input 'x'
+            is word id, num_classes is generally the dictionary size.
+
+    Returns:
+        Tensor: The one-hot representations of 'x'. A Tensor with type float32.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            # Correspond to the first example above, where label.shape is 4 and one_hot_label.shape is [4, 4].
+            label = paddle.data(name="label", shape=[4, 1], dtype="int64")
+            # label.shape = [4]
+            # label.data = [1, 1, 3, 0]
+            one_hot_label = paddle.nn.functional.one_hot(x=label, num_classes=4)
+            # one_hot_label.shape = [4, 4]
+            # one_hot_label.data = [[0., 1., 0., 0.],
+            #                       [0., 1., 0., 0.],
+            #                       [0., 0., 0., 1.],
+            #                       [1., 0., 0., 0.]]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.one_hot_v2(x, 'depth', num_classes,
+                                   'allow_out_of_range', False)
+    else:
+        check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'one_hot_v2')
+        helper = LayerHelper("one_hot_v2", **locals())
+
+        one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
+        if not isinstance(num_classes, Variable):
+            # user attribute 
+            inputs = {'X': x}
+            attrs = {'depth': num_classes, 'allow_out_of_range': False}
+        else:
+            num_classes.stop_gradient = True
+            inputs = {'X': x, 'depth_tensor': num_classes}
+            attrs = {'allow_out_of_range': False}
+        helper.append_op(
+            type="one_hot_v2",
+            inputs=inputs,
+            attrs=attrs,
+            outputs={'Out': one_hot_out},
+            stop_gradient=True)
+        return one_hot_out
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index bc6d26370f0254b9349ff1bb871a1840ba26293d..55bb36d136405385a88b991576c2a9091437d456 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -12,17 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define loss functions of neural network  
+import paddle
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+import paddle.fluid as fluid
+
+# TODO: define loss functions of neural network
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from ...fluid.framework import core, in_dygraph_mode
+from ...fluid.layers.nn import _elementwise_op_in_dygraph
 from ...fluid.layers import bpr_loss  #DEFINE_ALIAS
 from ...fluid.layers import center_loss  #DEFINE_ALIAS
-from ...fluid.layers import cross_entropy  #DEFINE_ALIAS
 from ...fluid.layers import dice_loss  #DEFINE_ALIAS
 from ...fluid.layers import iou_similarity  #DEFINE_ALIAS
-from ...fluid.layers import kldiv_loss  #DEFINE_ALIAS
 from ...fluid.layers import log_loss  #DEFINE_ALIAS
-from ...fluid.layers import mse_loss  #DEFINE_ALIAS
 from ...fluid.layers import npair_loss  #DEFINE_ALIAS
 from ...fluid.layers import rank_loss  #DEFINE_ALIAS
+from ...fluid.layers import reshape
 from ...fluid.layers import sigmoid_cross_entropy_with_logits  #DEFINE_ALIAS
 from ...fluid.layers import sigmoid_focal_loss  #DEFINE_ALIAS
 from ...fluid.layers import smooth_l1  #DEFINE_ALIAS
@@ -33,10 +41,15 @@ from ...fluid.layers import teacher_student_sigmoid_loss  #DEFINE_ALIAS
 
 from ...fluid.layers import edit_distance  #DEFINE_ALIAS
 from ...fluid.layers import huber_loss  #DEFINE_ALIAS
-from ...fluid.layers import margin_rank_loss  #DEFINE_ALIAS
 from ...fluid.layers import sampled_softmax_with_cross_entropy  #DEFINE_ALIAS
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.framework import in_dygraph_mode
+from ...fluid.framework import _varbase_creator
+from ...fluid.framework import Variable
 
 __all__ = [
+    'binary_cross_entropy',
+    'binary_cross_entropy_with_logits',
     'bpr_loss',
     'center_loss',
     'cross_entropy',
@@ -44,19 +57,1101 @@ __all__ = [
     'edit_distance',
     'huber_loss',
     'iou_similarity',
-    'kldiv_loss',
+    'kl_div',
+    'l1_loss',
     'log_loss',
-    'margin_rank_loss',
     'mse_loss',
+    'margin_ranking_loss',
     #       'nce',
+    'nll_loss',
     'npair_loss',
     'rank_loss',
     'sampled_softmax_with_cross_entropy',
     'sigmoid_cross_entropy_with_logits',
     'sigmoid_focal_loss',
     'smooth_l1',
+    'smooth_l1_loss',
     'softmax_with_cross_entropy',
     'square_error_cost',
     'ssd_loss',
-    'teacher_student_sigmoid_loss'
+    'teacher_student_sigmoid_loss',
+    'ctc_loss',
 ]
+
+
+def binary_cross_entropy(input, label, weight=None, reduction='mean',
+                         name=None):
+    """
+    This op measures the binary_cross_entropy loss between input predictions ``input``
+    and target labels ``label`` . The binary_cross_entropy loss can be described as:
+
+    If :attr:`weight` is set, the loss is:
+
+    .. math::
+        Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
+
+    If :attr:`weight` is None, the loss is:
+
+    .. math::
+        Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
+
+    If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`.
+
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
+    .. math::
+        Out = MEAN(Out)
+
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
+    .. math::
+        Out = SUM(Out)
+
+    Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label``
+    should be numbers between 0 and 1.
+
+    Parameters:
+        input (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``input``
+            should always be the output of sigmod.  Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``input``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, has to be a Tensor of size nbatch and the data type
+            is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+
+    Returns:
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``input`` , else the shape of output is scalar.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            input_data = np.array([0.5, 0.6, 0.7]).astype("float32")
+            label_data = np.array([1.0, 0.0, 1.0]).astype("float32")
+
+            paddle.disable_static()
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = paddle.nn.functional.binary_cross_entropy(input, label)
+            print(output.numpy())  # [0.65537095]
+            paddle.enable_static()
+
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in binary_cross_entropy should be 'sum', "
+            "'mean' or 'none', but received %s, which is not allowed." %
+            reduction)
+
+    if in_dygraph_mode():
+        out = core.ops.bce_loss(input, label)
+        if weight is not None:
+            out = core.ops.elementwise_mul(out, weight, 'axis', -1)
+
+        if reduction == 'sum':
+            return core.ops.reduce_sum(out, 'dim', [0], 'keep_dim', False,
+                                       "reduce_all", True)
+        elif reduction == 'mean':
+            return core.ops.reduce_mean(out, 'dim', [0], 'keep_dim', False,
+                                        "reduce_all", True)
+        else:
+            return out
+
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'binary_cross_entropy')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'binary_cross_entropy')
+
+    sub_name = name if weight is None and reduction is 'none' else None
+    helper = LayerHelper("binary_cross_entropy", name=sub_name)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='bce_loss',
+        inputs={
+            'X': [input],
+            'Label': [label],
+        },
+        outputs={'Out': [out]})
+
+    if weight is not None:
+        if isinstance(weight, paddle.framework.Variable):
+            weight_name = name if reduction is 'none' else None
+            out = paddle.multiply(out, weight, axis=-1, name=weight_name)
+        else:
+            raise ValueError(
+                "The weight is not a Tensor, please convert to Tensor.")
+
+    if reduction == 'sum':
+        return paddle.sum(out, name=name)
+    elif reduction == 'mean':
+        return paddle.mean(out, name=name)
+    else:
+        return out
+
+
+def binary_cross_entropy_with_logits(logit,
+                                     label,
+                                     weight=None,
+                                     reduction='mean',
+                                     pos_weight=None,
+                                     name=None):
+    """
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    This can be thought of as predicting labels for a data-point, where labels
+    are not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time or none of these.
+
+    First this operator calculate loss function as follows:
+
+    .. math::
+           Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit))
+
+    We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get:
+
+    .. math::
+           Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit})
+
+    For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+
+    .. math::
+           Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|})
+
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
+    weight tensor on the loss `Out`. The ``weight`` tensor will attach different
+    weight on every items in the batch. The ``pos_weight`` will attach different
+    weight on the positive label of each class.
+
+    Finally, this operator applies reduce operation on the loss.
+    If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`.
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+
+    Args:
+        logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of Linear layer. Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
+            The data type is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'mean'``.
+        pos_weight (Tensor, optional): A weight of positive examples. Must be a vector
+            with length equal to the number of classes. The data type is float32, float64.
+            Default is ``'None'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``logit`` , else the shape of output is scalar.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label)
+            print(output.numpy())  # [0.45618808]
+
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in binary_cross_entropy_with_logits "
+            "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
+            % reduction)
+
+    if in_dygraph_mode():
+        one = _varbase_creator(dtype=logit.dtype)
+        core.ops.fill_constant(one, 'value',
+                               float(1.0), 'force_cpu', False, 'dtype',
+                               one.dtype, 'str_value', '1.0', 'shape', [1])
+        out = core.ops.sigmoid_cross_entropy_with_logits(logit, label)
+        if pos_weight is not None:
+            log_weight = core.ops.elementwise_add(
+                core.ops.elementwise_mul(
+                    label, core.ops.elementwise_sub(pos_weight, one)), one)
+            out = core.ops.elementwise_mul(out, log_weight)
+        if weight is not None:
+            out = core.ops.elementwise_mul(out, weight)
+
+        if reduction == "sum":
+            return core.ops.reduce_sum(out, 'reduce_all', True)
+        elif reduction == "mean":
+            return core.ops.mean(out)
+        else:
+            return out
+
+    fluid.data_feeder.check_variable_and_dtype(
+        logit, 'logit', ['float32', 'float64'],
+        'binary_cross_entropy_with_logits')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'],
+        'binary_cross_entropy_with_logits')
+    sigmoid_name = None
+    if reduction == 'none' and pos_weight is None and weight is None:
+        sigmoid_name = name
+
+    out = paddle.nn.functional.sigmoid_cross_entropy_with_logits(
+        logit, label, name=sigmoid_name)
+
+    one = paddle.fill_constant(shape=[1], value=1.0, dtype=logit.dtype)
+    if pos_weight is not None:
+        fluid.data_feeder.check_variable_and_dtype(
+            pos_weight, 'pos_weight', ['float32', 'float64'],
+            'binary_cross_entropy_with_logits')
+        log_weight = paddle.add(
+            paddle.multiply(label, paddle.elementwise_sub(pos_weight, one)),
+            one)
+        pos_weight_name = name if reduction == 'none' and weight is None else None
+        out = paddle.multiply(out, log_weight, name=pos_weight_name)
+
+    if weight is not None:
+        fluid.data_feeder.check_variable_and_dtype(
+            weight, 'weight', ['float32', 'float64'],
+            'binary_cross_entropy_with_logits')
+        weight_name = name if reduction == 'none' else None
+        out = paddle.multiply(out, weight, name=weight_name)
+
+    if reduction == "sum":
+        return paddle.sum(out, name=name)
+    elif reduction == "mean":
+        return paddle.mean(out, name=name)
+    return out
+
+
+def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
+    """
+    This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
+    term if the absolute element-wise error falls below 1 and an L1 term otherwise.
+    In some cases it can prevent exploding gradients and it is more robust and less
+    sensitivity to outliers. Also known as the Huber loss:
+
+    .. math::
+
+         loss(x,y)=\\frac{1}{n}\\sum_{i}z_i
+
+
+    where z_i is given by:
+
+    .. math::
+
+         \\mathop{z_i}=\\left\\{\\begin{array}{rcl}
+        0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\
+        delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise}
+        \\end{array} \\right.
+
+    Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is
+            (N, C), where C is number of classes, and if shape is more than 2D, this
+            is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is float32 or float64. The shape of label
+            is the same as the shape of input.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        delta (float, optional): Specifies the hyperparameter delta to be used.
+            The value determines how large the errors need to be to use L1. Errors
+            smaller than delta are minimized with L2. Parameter is ignored for
+            negative/zero values. Default = 1.0
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        The tensor variable storing the smooth_l1_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            input_data = np.random.rand(3,3).astype("float32")
+            label_data = np.random.rand(3,3).astype("float32")
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = paddle.nn.functioanl.smooth_l1_loss(input, label)
+            print(output.numpy())
+    """
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'smooth_l1_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'smooth_l1_loss')
+
+    out = huber_loss(input=input, label=label, delta=delta)
+
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in smooth_l1_loss should be 'sum', 'mean' or"
+            " 'none', but received %s, which is not allowed." % reduction)
+    if reduction == 'none':
+        return out
+    elif reduction == 'mean':
+        return fluid.layers.reduce_mean(out)
+    elif reduction == 'sum':
+        return fluid.layers.reduce_sum(out)
+
+
+def margin_ranking_loss(input,
+                        other,
+                        label,
+                        margin=0.0,
+                        reduction='mean',
+                        name=None):
+    """
+
+    This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows.
+
+    .. math::
+        margin\_rank\_loss = max(0, -label * (input - other) + margin)
+
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
+    .. math::
+        Out = MEAN(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
+    .. math::
+        Out = SUM(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'none'``, just return the origin ``margin_rank_loss``.
+
+    Parameters:
+        input(Tensor): the first input tensor, it's data type should be float32, float64.
+        other(Tensor): the second input tensor, it's data type should be float32, float64.
+        label(Tensor): the label value corresponding to input, it's data type should be float32, float64.
+        margin (float, optional): The margin value to add, default value is 0;
+        reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns: Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype('float32'))
+            other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype('float32'))
+            label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype('float32'))
+            loss = paddle.nn.functional.margin_ranking_loss(input, other, label)
+            print(loss.numpy()) # [0.75]
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
+            "received %s, which is not allowed." % reduction)
+    if fluid.framework.in_dygraph_mode():
+        out = core.ops.elementwise_sub(other, input)
+        out = core.ops.elementwise_mul(out, label)
+        if margin != 0.0:
+            margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype)
+            out = core.ops.elementwise_add(out, margin)
+        out = core.ops.relu(out)
+        if reduction == 'sum':
+            return core.ops.reduce_sum(out, 'reduce_all', True)
+        elif reduction == 'mean':
+            return core.ops.mean(out)
+        return out
+
+    helper = LayerHelper("margin_ranking_loss", **locals())
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'margin_rank_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        other, 'other', ['float32', 'float64'], 'margin_rank_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'margin_rank_loss')
+
+    out = paddle.elementwise_sub(other, input)
+    out = paddle.multiply(out, label)
+
+    if margin != 0.0:
+        margin_var = out.block.create_var(dtype=out.dtype)
+        paddle.fill_constant([1], out.dtype, margin, out=margin_var)
+        out = paddle.add(out, margin_var)
+
+    result_out = helper.create_variable_for_type_inference(input.dtype)
+
+    if reduction == 'none':
+        helper.append_op(
+            type="relu", inputs={"X": out}, outputs={"Out": result_out})
+        return result_out
+    elif reduction == 'sum':
+        out = paddle.nn.functional.relu(out)
+        attrs = {"dim": [0], "keep_dim": False, "reduce_all": True}
+        helper.append_op(
+            type="reduce_sum",
+            inputs={"X": out},
+            outputs={"Out": result_out},
+            attrs=attrs)
+        return result_out
+    elif reduction == 'mean':
+        out = paddle.nn.functional.relu(out)
+        helper.append_op(
+            type="mean",
+            inputs={"X": out},
+            outputs={"Out": result_out},
+            attrs={})
+        return result_out
+
+
+def l1_loss(input, label, reduction='mean', name=None):
+    """
+    This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
+
+    If `reduction` set to ``'none'``, the loss is:
+
+    .. math::
+        Out = \lvert input - label\rvert
+
+    If `reduction` set to ``'mean'``, the loss is:
+
+    .. math::
+        Out = MEAN(\lvert input - label\rvert)
+
+    If `reduction` set to ``'sum'``, the loss is:
+
+    .. math::
+        Out = SUM(\lvert input - label\rvert)
+
+
+    Parameters:
+        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
+        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
+        reduction (str, optional): Indicate the reduction to apply to the loss,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If `reduction` is ``'none'``, the unreduced loss is returned;
+            If `reduction` is ``'mean'``, the reduced mean loss is returned.
+            If `reduction` is ``'sum'``, the reduced sum loss is returned.
+            Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+    Returns:
+        Tensor, the L1 Loss of Tensor ``input`` and ``label``.
+            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+            If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
+            label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+
+            l1_loss = paddle.nn.functional.l1_loss(input, label)
+            print(l1_loss.numpy())
+            # [0.35]
+
+            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='none')
+            print(l1_loss.numpy())
+            # [[0.20000005 0.19999999]
+            # [0.2        0.79999995]]
+
+            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+            print(l1_loss.numpy())
+            # [1.4]
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
+            "received %s, which is not allowed." % reduction)
+
+    if in_dygraph_mode():
+        unreduced = _elementwise_op_in_dygraph(
+            input, label, axis=-1, act='abs', op_name='elementwise_sub')
+        if reduction == 'mean':
+            return core.ops.mean(unreduced)
+        elif reduction == 'sum':
+            return core.ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', False,
+                                       'reduce_all', True)
+        else:
+            return unreduced
+
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
+
+    if reduction == 'sum':
+        unreduced = paddle.elementwise_sub(input, label, act='abs')
+        return paddle.sum(unreduced, name=name)
+    elif reduction == 'mean':
+        unreduced = paddle.elementwise_sub(input, label, act='abs')
+        return paddle.mean(unreduced, name=name)
+    else:
+        return paddle.elementwise_sub(input, label, act='abs', name=name)
+
+
+def nll_loss(input,
+             label,
+             weight=None,
+             ignore_index=-100,
+             reduction='mean',
+             name=None):
+    """
+    This api returns negative log likelihood.
+    See more detail in :ref:`api_nn_loss_NLLLoss` .
+
+    Parameters:
+         input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
+             But in K-dimension situation, the shape is :math:`[N, C, d_1, d_2, ..., d_K]`.
+             The data type is float32, float64.
+         label (Tensor): Label tensor, the shape is :math:`[N,]` or :math:`[N, d_1, d_2, ..., d_K]`.
+             The data type is int64.
+         weight (Tensor, optional): Weight tensor, a manual rescaling weight given
+             to each class. If given, it has to be a 1D Tensor whose size is `[C, ]`. Otherwise,
+             it treated as if having all ones. the data type is
+             float32, float64, Default is ``'None'``.
+         ignore_index (int64, optional): Specifies a target value that is ignored
+             and does not contribute to the input gradient.
+         reduction (str, optional): Indicate how to average the loss,
+             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+             If `reduction` is ``'mean'``, the reduced mean loss is returned;
+             if `reduction` is ``'sum'``, the reduced sum loss is returned;
+             if `reduction` is ``'none'``, no reduction will be apllied.
+             Default is ``'mean'``.
+         name (str, optional): Name for the operation (optional, default is None).
+             For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+         `Tensor`, the value of negative log likelihood loss.
+
+    Examples:
+        .. code-block:: python
+                import paddle
+                import numpy as np
+                from paddle.nn.functional import nll_loss
+                log_softmax = paddle.nn.LogSoftmax(axis=1)
+
+                input_np = np.array([[0.88103855, 0.9908683 , 0.6226845 ],
+                                     [0.53331435, 0.07999352, 0.8549948 ],
+                                     [0.25879037, 0.39530203, 0.698465  ],
+                                     [0.73427284, 0.63575995, 0.18827209],
+                                     [0.05689114, 0.0862954 , 0.6325046 ]]).astype(np.float32)
+                label_np = np.array([0, 2, 1, 1, 0]).astype(np.int64)
+
+                place = paddle.CPUPlace()
+                paddle.disable_static(place)
+                input = paddle.to_variable(input_np)
+                log_out = log_softmax(input)
+                label = paddle.to_variable(label_np)
+                result = nll_loss(log_out, label)
+                print(result.numpy()) # [1.0720209]
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
+            "'none', but received %s, which is not allowed." % reduction)
+
+    input_shape = list(input.shape)
+    input_dims = len(input_shape)
+    if input_dims < 2:
+        raise ValueError('Expected 2 or more dimensions (got {})'.format(
+            input_dims))
+    n = input_shape[0]
+    c = input_shape[1]
+    if in_dygraph_mode():
+        if input_dims != 2 and input_dims != 4:
+            input, _ = core.ops.reshape2(input, 'shape', [n, c, 1, -1])
+            label, _ = core.ops.reshape2(label, 'shape', [n, 1, -1])
+            out_shape = [n] + input_shape[2:]
+        out, total_weight = core.ops.nll_loss(input, label, weight,
+                                              'ignore_index', ignore_index,
+                                              'reduction', reduction)
+        if input_dims != 2 and input_dims != 4 and reduction == 'none':
+            out, _ = core.ops.reshape2(out, 'shape', out_shape)
+        return out
+
+    helper = LayerHelper('nll_loss', **locals())
+
+    if input_dims != 2 and input_dims != 4:
+        input = reshape(input, shape=[n, c, 1, -1])
+        label = reshape(label, shape=[n, 1, -1])
+        out_shape = [n] + input_shape[2:]
+
+    fluid.data_feeder.check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'nll_loss')
+    fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                               'nll_loss')
+    inputs = {'X': input, 'Label': label}
+    attrs = {'reduction': reduction, 'ignore_index': ignore_index}
+    if weight is not None:
+        if isinstance(weight, Variable):
+            inputs['Weight'] = weight
+
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    total_weight = helper.create_variable_for_type_inference(dtype=input.dtype)
+    outputs = {'Out': out, 'Total_weight': total_weight}
+
+    helper.append_op(
+        type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
+    if input_dims != 2 and input_dims != 4 and reduction == 'none':
+        out = reshape(out, shape=out_shape)
+
+    return out
+
+
+def kl_div(input, label, reduction='mean', name=None):
+    """
+    This operator calculates the Kullback-Leibler divergence loss
+    between Input(X) and Input(Target). Notes that Input(X) is the
+    log-probability and Input(Target) is the probability.
+
+    KL divergence loss is calculated as follows:
+
+    $$l(x, y) = y * (\log(y) - x)$$
+
+    While :math:`x` is input and :math:`y` is label.
+
+    While :attr:`reduction` is :attr:`none`, output loss is in
+    the same shape as input, loss in each point is calculated
+    seperately and no reduction is applied.
+
+    While :attr:`reduction` is :attr:`mean`, output loss is in
+    shape of [1] and loss value is the mean value of all losses.
+
+    While :attr:`reduction` is :attr:`sum`, output loss is in
+    shape of [1] and loss value is the sum value of all losses.
+
+    While :attr:`reduction` is :attr:`batchmean`, output loss is
+    in shape of [1] and loss value is the sum value of all losses
+    divided by batch size.
+
+    Args:
+        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means
+             any number of additional dimensions. It's data type should be float32, float64.
+        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64.
+        reduction (Tensor): Indicate how to average the loss,
+             the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``.
+             If `reduction` is ``'mean'``, the reduced mean loss is returned;
+             If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
+             if `reduction` is ``'sum'``, the reduced sum loss is returned;
+             if `reduction` is ``'none'``, no reduction will be apllied.
+             Default is ``'mean'``.
+        name(str, optional): Name for the operation (optional, default is None). For more information,
+            please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The KL divergence loss. The data type is same as input tensor
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn.functional as F
+
+            paddle.enable_imperative()
+
+            shape = (5, 20)
+            input = np.random.uniform(-10, 10, shape).astype('float32')
+            target = np.random.uniform(-10, 10, shape).astype('float32')
+
+            # 'batchmean' reduction, loss shape will be [N]
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='batchmean')
+            # shape=[5]
+
+            # 'mean' reduction, loss shape will be [1]
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='mean')
+            # shape=[1]
+
+            # 'sum' reduction, loss shape will be [1]
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='sum')
+            # shape=[1]
+
+            # 'none' reduction, loss shape is same with input shape
+            pred_loss = F.kl_div(paddle.to_variable(input),
+                                 paddle.to_variable(target), reduction='none')
+            # shape=[5, 20]
+
+    """
+    if paddle.in_dynamic_mode():
+        out = core.ops.kldiv_loss(input, label, 'reduction', reduction)
+        return out
+
+    helper = LayerHelper('kl_div', **locals())
+
+    fluid.data_feeder.check_variable_and_dtype(input, 'input',
+                                               ['float32', 'float64'], 'kl_div')
+    fluid.data_feeder.check_variable_and_dtype(label, 'label',
+                                               ['float32', 'float64'], 'kl_div')
+    fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div')
+
+    loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='kldiv_loss',
+        inputs={'X': input,
+                'Target': label},
+        outputs={'Loss': loss},
+        attrs={'reduction': reduction})
+    return loss
+
+
+def mse_loss(input, label, reduction='mean', name=None):
+    """
+    This op accepts input predications and label and returns the mean square error.
+
+    If :attr:`reduction` is set to ``'none'``, loss is calculated as:
+
+    .. math::
+        Out = (input - label)^2
+
+    If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
+
+    .. math::
+        Out = \operatorname{mean}((input - label)^2)
+
+    If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
+
+    .. math::
+        Out = \operatorname{sum}((input - label)^2)
+
+    Parameters:
+        input (Tensor): Input tensor, the data type should be float32 or float64.
+        label (Tensor): Label tensor, the data type should be float32 or float64.
+        reduction (string, optional): The reduction method for the output,
+            could be 'none' | 'mean' | 'sum'.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+
+    Returns:
+        Tensor: The tensor tensor storing the mean square error difference of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+
+        .. code-block:: python
+            import numpy as np
+            import paddle
+
+
+            # static graph mode
+            paddle.enable_static()
+            mse_loss = paddle.nn.loss.MSELoss()
+            input = paddle.data(name="input", shape=[1])
+            label = paddle.data(name="label", shape=[1])
+            place = paddle.CPUPlace()
+            input_data = np.array([1.5]).astype("float32")
+            label_data = np.array([1.7]).astype("float32")
+
+            output = mse_loss(input,label)
+            exe = paddle.static.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            output_data = exe.run(
+                paddle.static.default_main_program(),
+                feed={"input":input_data, "label":label_data},
+                fetch_list=[output],
+                return_numpy=True)
+            print(output_data)
+            # [array([0.04000002], dtype=float32)]
+
+            # dynamic graph mode
+            paddle.disable_static()
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+            output = mse_loss(input, label)
+            print(output.numpy())
+            # [0.04000002]
+
+    """
+
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "'reduction' in 'mse_loss' should be 'sum', 'mean' or 'none', "
+            "but received {}.".format(reduction))
+
+    if not paddle.fluid.framework.in_dygraph_mode():
+        paddle.fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'mse_loss')
+        paddle.fluid.data_feeder.check_variable_and_dtype(
+            label, 'label', ['float32', 'float64'], 'mse_loss')
+
+    if reduction == 'none':
+        return paddle.fluid.layers.square(
+            paddle.fluid.layers.elementwise_sub(input, label), name=name)
+    elif reduction == 'mean':
+        return paddle.mean(
+            paddle.fluid.layers.square(
+                paddle.fluid.layers.elementwise_sub(input, label)),
+            name=name)
+    else:
+        return paddle.sum(paddle.fluid.layers.square(
+            paddle.fluid.layers.elementwise_sub(input, label)),
+                          name=name)
+
+
+def ctc_loss(log_probs,
+             labels,
+             input_lengths,
+             label_lengths,
+             blank=0,
+             reduction='mean'):
+    """
+
+    An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
+    to compute Connectionist Temporal Classification (CTC) loss.
+    It can be aliased as softmax with CTC, since a native softmax activation
+    is interated to the Warp-CTC library to normalize values for each row of the input tensor.
+
+    Parameters:
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
+        input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
+        label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
+        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
+        reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
+
+    Returns:
+        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and  ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
+
+    Examples:
+
+        .. code-block:: python
+
+            # declarative mode
+            import paddle.nn.functional as F
+            import numpy as np
+            import paddle
+
+            # length of the longest logit sequence
+            max_seq_length = 4
+            #length of the longest label sequence
+            max_label_length = 3
+            # number of logit sequences
+            batch_size = 2
+            # class num
+            class_num = 3
+
+            np.random.seed(1)
+            log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
+                                    [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
+
+                                    [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
+                                    [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
+
+                                    [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
+                                    [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
+
+                                    [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
+                                    [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
+
+                                    [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
+                                    [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32")
+            labels = np.array([[1, 2, 2],
+                            [1, 2, 2]]).astype("int32")
+            input_lengths = np.array([5, 5]).astype("int64")
+            label_lengths = np.array([3, 3]).astype("int64")
+
+            paddle.disable_static()
+            log_probs = paddle.to_tensor(log_probs)
+            labels = paddle.to_tensor(labels)
+            input_lengths = paddle.to_tensor(input_lengths)
+            label_lengths = paddle.to_tensor(label_lengths)
+
+            loss = F.ctc_loss(log_probs, labels,
+                input_lengths,
+                label_lengths,
+                blank=0,
+                reduction='none')
+            print(loss.numpy())  #[3.9179852 2.9076521]
+
+            loss = F.ctc_loss(log_probs, labels,
+                input_lengths,
+                label_lengths,
+                blank=0,
+                reduction='mean')
+            print(loss.numpy())  #[1.1376063]
+
+    """
+
+    loss_out = fluid.layers.warpctc(log_probs, labels, blank, False,
+                                    input_lengths, label_lengths)
+
+    loss_out = fluid.layers.squeeze(loss_out, [-1])
+    assert reduction in ['mean', 'sum', 'none']
+    if reduction == 'mean':
+        loss_out = paddle.mean(loss_out / paddle.cast(label_lengths,
+                                                      loss_out.dtype))
+    elif reduction == 'sum':
+        loss_out = paddle.sum(loss_out)
+    return loss_out
+
+
+def cross_entropy(input,
+                  label,
+                  weight=None,
+                  ignore_index=-100,
+                  reduction='mean'):
+    """
+    This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``,
+    and ``NLLLoss`` together.
+
+    It is useful when training a classification problem with ``C`` classes.
+    If provided, the optional argument ``weight`` should be a 1D Variable assigning
+    weight to each of the classes.
+
+    For predictions label, and target label, the loss is calculated as follows.
+
+    .. math::
+
+        loss_j =  -\\text{input[class]} +
+        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right), j = 1,..., K
+
+    If weight is not ``None``:
+
+    .. math::
+
+        loss_j =  \\text{weight[class]}(-\\text{input[class]} +
+        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right)), j = 1,..., K
+
+    Parameters:
+        input (Tensor): Input tensor, the data type is float32, float64. Shape is
+	    (N, C), where C is number of classes, and if shape is more than 2D, this
+	    is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+	    value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+	    (N, D1, D2,..., Dk), k >= 1.
+        weight (Tensor, optional): Weight tensor, a manual rescaling weight given
+            to each class and the shape is (C). It has the same dimensions as class
+	    number and the data type is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default is ``-100``.
+
+    Returns:
+        The tensor variable storing the cross_entropy_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+            input_data = np.random.random([5, 100]).astype("float64")
+            label_data = np.random.randint(0, 100, size=(5)).astype(np.int64)
+            weight_data = np.random.random([100]).astype("float64")
+            input =  paddle.to_tensor(input_data)
+            label =  paddle.to_tensor(label_data)
+            weight = paddle.to_tensor(weight_data)
+            loss = paddle.nn.functional.cross_entropy(input=input, label=label, weight=weight)
+            print(loss.numpy())
+
+    """
+    if not in_dygraph_mode():
+        fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'cross_entropy_loss')
+        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                                   'cross_entropy_loss')
+
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in cross_entropy_loss should be 'sum', 'mean' or"
+            " 'none', but received %s, which is not allowed." % reduction)
+
+    #step 1. log_softmax
+    log_softmax_out = paddle.nn.functional.log_softmax(input)
+    if weight is not None and not isinstance(weight, Variable):
+        raise ValueError(
+            "The weight' is not a Variable, please convert to Variable.")
+
+    #step 2. nll_loss
+    input = log_softmax_out
+    helper = LayerHelper('nll_loss', **locals())
+    dtype = helper.input_dtype(input)
+
+    if not in_dygraph_mode():
+        fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'nll_loss')
+        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                                   'nll_loss')
+
+    x_shape = list(input.shape)
+    n = x_shape[0]
+    c = x_shape[1]
+    x_dims = len(x_shape)
+    if x_dims < 2:
+        raise ValueError('Expected 2 or more dimensions (got {})'.format(
+            x_dims))
+    if x_dims != 2 and x_dims != 4:
+        input = reshape(input, shape=[n, c, 1, -1])
+        label = reshape(label, shape=[n, 1, -1])
+        out_shape = [n] + x_shape[2:]
+
+    if not in_dygraph_mode():
+        fluid.data_feeder.check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'nll_loss')
+        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
+                                                   'nll_loss')
+    inputs = {'X': input, 'Label': label}
+    attrs = {'reduction': reduction, 'ignore_index': ignore_index}
+    if weight is not None:
+        if isinstance(weight, Variable):
+            inputs['Weight'] = weight
+
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    total_weight = helper.create_variable_for_type_inference(dtype=input.dtype)
+    outputs = {'Out': out, 'Total_weight': total_weight}
+
+    helper.append_op(
+        type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
+    if x_dims != 2 and x_dims != 4 and reduction == 'none':
+        out = reshape(out, shape=out_shape)
+
+    return out
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 04b031b91ce387c1d8266d53725090d23b592f8c..13e86e5712a1cd5c014517e37d3803ca24cfb6fb 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -13,16 +13,386 @@
 # limitations under the License.
 
 # TODO: define normalization api  
+import paddle
+import paddle.fluid as fluid
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.framework import in_dygraph_mode, core
+from ...framework import create_parameter
 from ...fluid.layers import l2_normalize  #DEFINE_ALIAS
 from ...fluid.layers import lrn  #DEFINE_ALIAS
+from ...fluid.initializer import Constant
+from ...fluid.param_attr import ParamAttr
+from ...fluid import core, dygraph_utils
 
 __all__ = [
-    #       'batch_norm',
+    'batch_norm',
     #       'data_norm',
-    #       'group_norm',
-    #       'instance_norm',
+    'instance_norm',
     'l2_normalize',
-    #       'layer_norm',
+    'layer_norm',
     'lrn',
+    'normalize',
     #       'spectral_norm'
 ]
+
+
+def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
+    """
+    This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes
+
+    .. math::
+
+        y = \frac{x}{ \max\left( \lvert \lvert x \rvert \rvert_p, epsilon\right) }
+    
+    .. math::
+        \lvert \lvert x \rvert \rvert_p = \left(\sum_i {\lvert x_i\rvert^p}  \right)^{1/p}
+
+    where, :math:`\sum_i{\lvert x_i\rvert^p}` is calculated along the ``axis`` dimension.
+
+
+    Args:
+        x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64.
+        p (float|int, optional): The exponent value in the norm formulation. Default: 2
+        axis (int, optional): The axis on which to apply normalization. If ``x`` is 1-D tensor, ``axis`` is fixed to 0. If `axis < 0`, \
+            the dimension to normalization is `x.ndim + axis`. -1 is the last dimension.
+        epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, the output has the same shape and data type with ``x``.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+
+            paddle.disable_static()
+            x = np.arange(6, dtype=np.float32).reshape(2,3)
+            x = paddle.to_variable(x)
+            y = F.normalize(x)
+            print(y.numpy())
+            # [[0.         0.4472136  0.8944272 ]
+            # [0.42426404 0.5656854  0.7071067 ]]
+
+            y = F.normalize(x, p=1.5)
+            print(y.numpy())
+            # [[0.         0.40862012 0.81724024]
+            # [0.35684016 0.4757869  0.5947336 ]]
+
+            y = F.normalize(x, axis=0)
+            print(y.numpy())
+            # [[0.         0.24253564 0.37139067]
+            # [1.         0.97014254 0.9284767 ]]
+    """
+    if len(x.shape) == 1:
+        axis = 0
+    if in_dygraph_mode():
+        eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype)
+        out = core.ops.p_norm(x, 'axis', axis, 'porder',
+                              float(p), 'keepdim', True, 'epsilon', epsilon)
+        return x / core.ops.elementwise_max(out, eps)
+
+    check_type(p, 'p', (float, int), 'normalize')
+    check_type(axis, 'axis', (int), 'normalize')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'normalize')
+
+    attrs = {
+        'axis': axis,
+        'porder': float(p),
+        'keepdim': True,
+        'epsilon': epsilon,
+    }
+    helper = LayerHelper('p_norm', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    eps = out.block.create_var(dtype=out.dtype)
+    paddle.fill_constant([1], out.dtype, epsilon, out=eps)
+    return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name)
+
+
+def batch_norm(x,
+               running_mean,
+               running_var,
+               weight,
+               bias,
+               training=False,
+               momentum=0.9,
+               epsilon=1e-05,
+               data_format="NCHW",
+               name=None):
+    """
+    Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    nn.functional.batch_norm is uesd for nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d. Please use above API for BatchNorm.
+    
+    Parameters:
+        x(Tesnor): input value. It's data type should be float32, float64.
+        running_mean(Tensor): running mean.
+        running_var(Tensor): running variance.
+        weight(Tensor): The weight tensor of batch_norm, can not be None.
+        bias(Tensor): The bias tensor of batch_norm can not be None. 
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW".
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          x = np.random.seed(123)
+          x = np.random.random(size=(2, 1, 2, 3)).astype('float32')
+          running_mean = np.random.random(size=1).astype('float32')
+          running_variance = np.random.random(size=1).astype('float32')
+          weight_data = np.random.random(size=1).astype('float32')
+          bias_data = np.random.random(size=1).astype('float32')
+          x = paddle.to_tensor(x)
+          rm = paddle.to_tensor(running_mean)
+          rv = paddle.to_tensor(running_variance)
+          w = paddle.to_tensor(weight_data)
+          b = paddle.to_tensor(bias_data)
+          batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b)
+          print batch_norm_out
+    """
+
+    assert len(x.shape) >= 2, "input dim must be larger than 1"
+
+    # we use not training means use_global_status, more details see nn._BatchNormBase
+    use_global_stats = not training
+    # input ad out must share the memory
+    mean_out = running_mean
+    variance_out = running_var
+
+    if in_dygraph_mode():
+        # for dygraph need tuple
+        attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
+                 data_format, "use_mkldnn", False, "fuse_with_relu", False,
+                 "use_global_stats", use_global_stats)
+        batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
+            x, weight, bias, running_mean, running_var, mean_out, variance_out,
+            *attrs)
+
+        return dygraph_utils._append_activation_in_dygraph(
+            batch_norm_out, act=None)
+
+    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                             'BatchNorm')
+
+    # for static need dict
+    attrs = {
+        "momentum": momentum,
+        "epsilon": epsilon,
+        "data_layout": data_format,
+        "use_mkldnn": False,
+        "fuse_with_relu": False,
+        "use_global_stats": use_global_stats,
+    }
+
+    inputs = {
+        "X": [x],
+        "Scale": [weight],
+        "Bias": [bias],
+        "Mean": [running_mean],
+        "Variance": [running_var]
+    }
+
+    helper = LayerHelper('batch_norm', **locals())
+
+    dtype = x.dtype if x.dtype is not 'float16' else 'float32'
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    batch_norm_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {
+        "Y": [batch_norm_out],
+        "MeanOut": [running_mean],
+        "VarianceOut": [running_var],
+        "SavedMean": [saved_mean],
+        "SavedVariance": [saved_variance]
+    }
+
+    helper.append_op(
+        type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+
+    return helper.append_activation(batch_norm_out)
+
+
+def layer_norm(x,
+               normalized_shape,
+               weight=None,
+               bias=None,
+               epsilon=1e-05,
+               name=None):
+    """
+    see more detail in paddle.nn.LayerNorm
+    
+    Parameters:
+        x(Tensor): Input Tensor. It's data type should be float32, float64.
+        normalized_shape(int|list|tuple): Input shape from an expected input of
+            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
+            If it is a single integer, this module will normalize over the last dimension
+            which is expected to be of that specific size.
+        epsilon(float, optional): The small value added to the variance to prevent
+            division by zero. Default: 1e-05.
+        weight(Tensor, optional): The weight tensor of batch_norm. Default: None.
+        bias(Tensor, optional): The bias tensor of batch_norm. Default: None.
+        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:])
+          layer_norm_out = layer_norm(x)
+
+          print(layer_norm_out.numpy)
+    """
+    input_shape = list(x.shape)
+    input_ndim = len(input_shape)
+    normalized_ndim = len(normalized_shape)
+    begin_norm_axis = input_ndim - normalized_ndim
+    if input_ndim < normalized_ndim or input_shape[
+            begin_norm_axis:] != normalized_shape:
+        str_normalized_shape = str(normalized_shape)
+        raise ValueError('Given normalized_shape is ' + str_normalized_shape +
+                         ', expected input with shape [*, ' +
+                         str_normalized_shape[
+                             1:] + ', but got input shape ' + str(input_shape))
+
+    if in_dygraph_mode():
+        pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon,
+                                            'begin_norm_axis', begin_norm_axis)
+        return dygraph_utils._append_activation_in_dygraph(pre_act, act=None)
+
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm')
+
+    inputs = dict()
+    inputs['X'] = [x]
+    if weight:
+        inputs['Scale'] = [weight]
+    if bias:
+        inputs['Bias'] = [bias]
+    attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}
+
+    # create output
+    helper = LayerHelper('layer_norm', **locals())
+    mean_out = helper.create_variable_for_type_inference(
+        dtype=x.type, stop_gradient=True)
+    variance_out = helper.create_variable_for_type_inference(
+        dtype=x.type, stop_gradient=True)
+    layer_norm_out = helper.create_variable_for_type_inference(x.type)
+
+    helper.append_op(
+        type="layer_norm",
+        inputs=inputs,
+        outputs={
+            "Y": layer_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "begin_norm_axis": begin_norm_axis})
+
+    return helper.append_activation(layer_norm_out)
+
+
+def instance_norm(x,
+                  running_mean=None,
+                  running_var=None,
+                  weight=None,
+                  bias=None,
+                  use_input_stats=True,
+                  momentum=0.9,
+                  eps=1e-05,
+                  data_format="NCHW",
+                  name=None):
+    """
+    See more detail in nn.layer.InstanceNorm2d.
+
+    Parameters:
+        x(Tensor): Input Tensor. It's data type should be float32, float64.
+        running_mean(Tensor): running mean. Default None.
+        running_var(Tensor): running variance. Default None.
+        weight(Tensor, optional): The weight tensor of instance_norm. Default: None.
+        bias(Tensor, optional): The bias tensor of instance_norm. Default: None.
+        eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        use_input_stats(bool): Default True.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW".
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Returns:
+        None.
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm_out = paddle.nn.functional.instancenorm(x)
+
+          print(instance_norm_out.numpy)
+
+    """
+
+    if in_dygraph_mode():
+        out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps,
+                                           "momentum", momentum, "data_format",
+                                           data_format)
+        return out
+
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm")
+
+    attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format}
+
+    if weight and bias:
+        inputs = {"X": [x], "Scale": [weight], "Bias": [bias]}
+    else:
+        inputs = {"X": [x]}
+
+    helper = LayerHelper('instance_norm', **locals())
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
+
+    outputs = {
+        "Y": [instance_norm_out],
+        "SavedMean": [saved_mean],
+        "SavedVariance": [saved_variance]
+    }
+
+    helper.append_op(
+        type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+    return instance_norm_out
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
old mode 100644
new mode 100755
index 618145fb1fad47e2105edf6186bb4606494d57c9..ca657b8be3e67c7acb795a0f427ca5fe2c57b1f2
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -17,5 +17,1447 @@ from ...fluid.layers import pool2d  #DEFINE_ALIAS
 from ...fluid.layers import pool3d  #DEFINE_ALIAS
 from ...fluid.layers import adaptive_pool2d  #DEFINE_ALIAS
 from ...fluid.layers import adaptive_pool3d  #DEFINE_ALIAS
+from ...fluid import core
+from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
+from ...fluid.layers import utils, LayerHelper
+from ...fluid.data_feeder import check_type, check_variable_and_dtype, check_type, check_dtype, convert_dtype
+from ...fluid.layers import unsqueeze, squeeze
 
-__all__ = ['pool2d', 'pool3d', 'adaptive_pool2d', 'adaptive_pool3d']
+__all__ = [
+    'pool2d',
+    'pool3d',
+    'avg_pool1d',
+    'max_pool1d',
+    'adaptive_avg_pool1d',
+    'adaptive_max_pool1d',
+    'adaptive_avg_pool2d',
+    'adaptive_avg_pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
+    'max_pool2d',
+    'avg_pool2d',
+    'max_pool3d',
+    'avg_pool3d',
+]
+
+
+def check_input(x, dimension):
+    if len(x.shape) != dimension:
+        raise ValueError("Excepted Input X is 3-D tensor, but received {}-D {}".
+                         format(len(x.shape), type(x)))
+
+
+def check_instance(x, x_name, types=(int, float)):
+
+    if not isinstance(x, types):
+        raise ValueError("Excepted {} type for {} but received type: {}. ".
+                         format(types, x_name, type(x)))
+
+
+def update_padding1d(padding, pool_type='avg'):
+    def is_list_or_tuple(ele):
+        if isinstance(ele, list) or isinstance(ele, tuple):
+            return True
+        return False
+
+    if is_list_or_tuple(padding):
+        if padding.__len__() == 1 and not is_list_or_tuple(padding[0]):
+            return [0, padding[0]]
+        else:
+            raise ValueError(
+                "{}_pool1d() argument 'padding' should contain one int (got {})".
+                format(pool_type, padding.__len__()))
+    else:
+        padding = [0, padding]
+
+    return padding
+
+
+def update_padding2d(padding, data_format):
+    def is_list_or_tuple(ele):
+        if isinstance(ele, list) or isinstance(ele, tuple):
+            return True
+        return False
+
+    if is_list_or_tuple(padding) and len(padding) == 4:
+        if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
+            if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[2:4]
+            padding = [ele for a_list in padding for ele in a_list]
+        elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"):
+            if not (padding[0] == [0, 0] and padding[3] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[1:3]
+            padding = [ele for a_list in padding for ele in a_list]
+        padding = utils.convert_to_list(padding, 4, 'padding')
+
+        if utils._is_symmetric_padding(padding, 2):
+            padding = [padding[0], padding[2]]
+    else:
+        padding = utils.convert_to_list(padding, 2, 'padding')
+
+    return padding
+
+
+def update_padding3d(padding, data_format):
+    def is_list_or_tuple(ele):
+        if isinstance(ele, (list, tuple)):
+            return True
+        return False
+
+    if is_list_or_tuple(padding) and len(padding) == 5:
+        if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"):
+            if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[2:5]
+            padding = [ele for a_list in padding for ele in a_list]
+        elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"):
+            if not (padding[0] == [0, 0] and padding[4] == [0, 0]):
+                raise ValueError(
+                    "Non-zero pool_padding(%s) in the batch or channel dimensions "
+                    "is not supported." % str(padding))
+            padding = padding[1:4]
+            padding = [ele for a_list in padding for ele in a_list]
+        padding = utils.convert_to_list(padding, 6, 'padding')
+        if utils._is_symmetric_padding(padding, 3):
+            padding = [padding[0], padding[2], padding[4]]
+
+    elif is_list_or_tuple(padding) and len(padding) == 6:
+        padding = utils.convert_to_list(padding, 6, 'padding')
+        if utils._is_symmetric_padding(padding, 3):
+            padding = [padding[0], padding[2], padding[4]]
+    else:
+        padding = utils.convert_to_list(padding, 3, 'padding')
+
+    return padding
+
+
+def avg_pool1d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               count_include_pad=True,
+               ceil_mode=False,
+               name=None):
+    """
+
+    This operation applies a 1D average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k])
+
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
+                          shape [N, C, L]. where `N` is batch size, `C` is the number of channels,
+                          `L` is the length of the feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
+            then the input is implicitly zero-padded on both sides for padding number of points.
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
+            If it is set to False, the floor function will be used. Default False
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          pool_out = F.avg_pool1d(data, kernel_size=2, stride=2, padding=0)
+          # pool_out shape: [1, 3, 16]
+
+    """
+    """NCL to NCHW"""
+    data_format = "NCHW"
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'avg_pool1d')
+    check_input(x, 3)
+    x = unsqueeze(x, [2])
+    kernel_size = utils.convert_to_list(kernel_size, 1, 'pool_size')
+    kernel_size = [1] + kernel_size
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 1, 'pool_stride')
+        stride = [1] + stride
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0]
+
+    padding = update_padding1d(padding, "avg")
+
+    if in_dygraph_mode():
+        output = core.ops.pool2d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
+            False, 'strides', stride, 'paddings', padding, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', not count_include_pad, 'ceil_mode',
+            ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format',
+            data_format)
+        return squeeze(output, [2])
+
+    op_type = 'pool2d'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": 'avg',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
+        })
+
+    return squeeze(pool_out, [2])
+
+
+def max_pool1d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               return_indices=False,
+               ceil_mode=False,
+               name=None):
+    """
+
+    Applies a 1D max pooling over an input signal composed of several input planes based
+    on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])}
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
+                          shape [N, C, L], where `N` is batch size, `C` is the number of channels,
+                          `L` is the length of the feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`.
+        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
+        ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
+            If it is set to False, the floor function will be used. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn.functional as F
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
+          # pool_out shape: [1, 3, 16]
+
+          pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
+          # pool_out shape: [1, 3, 16],  indices shape: [1, 3, 16]
+
+    """
+    """NCL to NCHW"""
+    data_format = "NCHW"
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'max_pool1d')
+    check_input(x, 3)
+    x = unsqueeze(x, [2])
+    kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0]
+
+    padding = update_padding1d(padding, 'max')
+
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool2d_with_index(
+            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
+            'paddings', padding, 'padding_algorithm', padding_algorithm,
+            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
+            'exclusive', True, 'data_format', data_format)
+        return (squeeze(pool_out[0], [2]), squeeze(
+            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+
+    op_type = 'max_pool2d_with_index'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": True,
+            "data_format": data_format,
+        })
+
+    return (squeeze(pool_out, [2]),
+            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+
+
+def adaptive_avg_pool1d(x, output_size, name=None):
+    """
+
+    This operation applies a 1D adaptive average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For average adaptive pool1d:
+
+    ..  math::
+
+        lstart &= floor(i * L_{in} / L_{out})
+
+        lend &= ceil((i + 1) * L_{in} / L_{out})
+
+        Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
+                              with shape [N, C, L].  The format of input tensor is NCL,
+                              where N is batch size, C is the number of channels, L is the
+                              length of the feature. The data type is float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+                it must contain one int.
+        name(str, optional): For detailed information, please refer
+                                 to :ref:`api_guide_Name`. Usually name is no need to set and
+                                 None by default.
+
+    Returns:
+            Tensor: The output tensor of adaptive average pooling result. The data type is same
+                      as input tensor.
+
+    Raises:
+            ValueError: 'output_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+              # average adaptive pool1d
+              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+              # output shape is [N, C, m], adaptive pool divide L dimension
+              # of input data into m grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follow:
+              #
+              #     for i in range(m):
+              #         lstart = floor(i * L / m)
+              #         lend = ceil((i + 1) * L / m)
+              #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
+              #
+              import paddle
+              import paddle.nn.functional as F
+              paddle.disable_static()
+
+              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+              pool_out = F.adaptive_average_pool1d(data, output_size=16)
+              # pool_out shape: [1, 3, 16])
+    """
+    pool_type = 'avg'
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'],
+                             'adaptive_pool2d')
+    check_input(x, 3)
+    check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
+
+    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
+
+    l_type = "pool2d"
+    x = unsqueeze(x, [2])
+    if in_dygraph_mode():
+        pool_out = core.ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
+                                   pool_size, 'adaptive', True)
+        return squeeze(pool_out, [2])
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return squeeze(pool_out, [2])
+
+
+def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
+    """
+    This operation applies a 1D adaptive max pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For max adaptive pool1d:
+
+    ..  math::
+
+        lstart &= floor(i * L_{in} / L_{out})
+
+        lend &= ceil((i + 1) * L_{in} / L_{out})
+
+        Output(i) &= max(Input[lstart:lend])}
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 3-D tensor
+                              with shape [N, C, L].  The format of input tensor is NCL,
+                              where N is batch size, C is the number of channels, L is the
+                              length of the feature. The data type is float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+                it must contain one int.
+        return_indices (bool): If true, the index of max pooling point will be returned along
+                with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                                 to :ref:`api_guide_Name`. Usually name is no need to set and
+                                 None by default.
+
+    Returns:
+            Tensor: The output tensor of adaptive pooling result. The data type is same
+                      as input tensor.
+
+    Raises:
+            ValueError: 'output_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+              # max adaptive pool1d
+              # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+              # output shape is [N, C, m], adaptive pool divide L dimension
+              # of input data into m grids averagely and performs poolings in each
+              # grid to get output.
+              # adaptive max pool performs calculations as follow:
+              #
+              #     for i in range(m):
+              #         lstart = floor(i * L / m)
+              #         lend = ceil((i + 1) * L / m)
+              #         output[:, :, i] = max(input[:, :, lstart: lend])
+              #
+              import paddle
+              import paddle.nn.functional as F
+              paddle.disable_static()
+
+              data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+              pool_out = F.adaptive_max_pool1d(data, output_size=16)
+              # pool_out shape: [1, 3, 16])
+
+              pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
+              # pool_out shape: [1, 3, 16] indices  shape: [1, 3, 16]
+
+    """
+    pool_type = 'max'
+    check_variable_and_dtype(x, 'input', ['float32', 'float64'],
+                             'adaptive_max_pool1d')
+    check_input(x, 3)
+    check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d')
+    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
+
+    pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
+
+    l_type = 'max_pool2d_with_index'
+
+    x = unsqueeze(x, [2])
+    if in_dygraph_mode():
+        pool_out = core.ops.max_pool2d_with_index(
+            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
+        return (squeeze(pool_out[0], [2]), squeeze(
+            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        })
+
+    return (squeeze(pool_out, [2]),
+            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+
+
+def max_pool2d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               return_indices=False,
+               ceil_mode=False,
+               data_format="NCHW",
+               name=None):
+    """
+    This operation applies 2D max pooling over input feature based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+           stride: stride
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
+                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n)
+           $$
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
+                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
+                          `H` is the height of the feature, and `W` is the width of the
+                          feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          output = F.max_pool2d(input,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          # output.shape [1, 3, 16, 16]
+
+          # for return_indices=True
+          output, max_indices = F.max_pool2d(input,
+                                             kernel_size=2,
+                                             stride=2,
+                                             padding=0,
+                                             return_indices=True)
+          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
+    kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 2, 'pool_stride')
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0]
+
+    padding = update_padding2d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.max_pool2d_with_index(
+            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
+            'paddings', padding, 'padding_algorithm', padding_algorithm,
+            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
+            'exclusive', True, 'data_format', data_format)
+        return output if return_indices else output[0]
+
+    op_type = 'max_pool2d_with_index'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": True,
+            "data_format": data_format,
+        })
+
+    return (pool_out, mask) if return_indices else pool_out
+
+
+def avg_pool2d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               ceil_mode=False,
+               count_include_pad=True,
+               divisor_override=None,
+               data_format="NCHW",
+               name=None):
+    """
+    This operation applies 2D average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
+                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+           $$
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
+                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
+                          `H` is the height of the feature, and `W` is the width of the
+                          feature. The data type if float32 or float64.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          paddle.disable_static()
+
+          # avg pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          output = F.avg_pool2d(input,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          # output.shape [1, 3, 16, 16]
+
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool2d')
+    kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 2, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0]
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+    pool_padding = update_padding2d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.pool2d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
+            False, 'padding_algorithm', padding_algorithm, 'strides', stride,
+            'paddings', pool_padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
+        if divisor_override is None:
+            return output
+        else:
+            check_instance(divisor_override, "divisor_override")
+            return output * (kernel_size[0] * kernel_size[1]) / divisor_override
+
+    op_type = 'pool2d'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": "avg",
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": pool_padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
+        })
+
+    if divisor_override is None:
+        return pool_out
+    else:
+        check_instance(divisor_override, "divisor_override")
+        return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
+
+
+def max_pool3d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               return_indices=False,
+               ceil_mode=False,
+               data_format="NCDHW",
+               name=None):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, D_{in}, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+           $$
+           \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, ksize[0]-1} \max_{m=0, \ldots, ksize[1]-1} \max_{n=0, \ldots, ksize[2]-1} \\
+                                              & \text{input}(N_i, C_j, \text{stride[0]} \times d + k,
+                                                             \text{stride[1]} \times h + m, \text{stride[2]} \times w + n)
+           $$
+
+    Args:
+        x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
+                          shape [N, C, D, H, W]. The format of
+                          input tensor is `"NCDHW"` or `"NDHWC"`, where `N` is batch size, `C` is
+                          the number of channels, `D` is the depth of the feature,
+                          `H` is the height of the feature, and `W` is the width
+                          of the feature.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): ${ceil_mode_comment}
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn.functional as F
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          output = F.max_pool2d(input,
+                                kernel_size=2,
+                                stride=2, padding=0)
+          output.shape [1, 3, 16, 16, 16]
+
+          # for return_indices=True
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          output, max_indices = paddle.nn.functional.max_pool3d(input,
+                                        kernel_size = 2,
+                                        stride = 2,
+                                        padding=0,
+                                        return_indices=True)
+          # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16],
+
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
+    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 3, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0, 0]
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s" % str(data_format))
+    padding = update_padding3d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.max_pool3d_with_index(
+            x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride,
+            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', True, 'data_format', data_format)
+        return output if return_indices else output[0]
+
+    op_type = "max_pool3d_with_index"
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    mask = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out, "Mask": mask}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": False,
+            "data_format": data_format,
+        })
+
+    return (pool_out, mask) if return_indices else pool_out
+
+
+def avg_pool3d(x,
+               kernel_size,
+               stride=None,
+               padding=0,
+               ceil_mode=False,
+               count_include_pad=False,
+               divisor_override=None,
+               data_format="NCDHW",
+               name=None):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Args:
+        input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with
+                          shape [N, C, D, H, W], where `N` is batch size, `C` is
+                          the number of channels, `D` is the depth of the feature,
+                          `H` is the height of the feature, and `W` is the width
+                          of the feature.
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): ${ceil_mode_comment}
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is True.
+        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+                        The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle.fluid as fluid
+          import paddle
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
+          # avg pool3d
+          pool3d = paddle.nn.functional.avg_pool3d(
+                                            input,
+                                            kernel_size = 2,
+                                            stride = 2,
+                                            padding=0)
+          # pool3d.shape: [1, 3, 16, 16, 16]
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
+    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 3, 'pool_stride')
+
+    padding_algorithm = "EXPLICIT"
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding not in ["SAME", "VALID"]:
+            raise ValueError(
+                "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
+                % str(padding))
+        if padding == "VALID":
+            padding_algorithm = "VALID"
+            padding = [0, 0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. "
+                    "Received ceil_mode: True.")
+        elif padding == "SAME":
+            padding_algorithm = "SAME"
+            padding = [0, 0, 0]
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s" % str(data_format))
+    padding = update_padding3d(padding, data_format)
+
+    if in_dygraph_mode():
+        output = core.ops.pool3d(
+            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
+            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
+            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+            'use_mkldnn', False, 'exclusive', not count_include_pad,
+            'data_format', data_format)
+        if divisor_override is None:
+            return output
+        else:
+            check_instance(divisor_override, "divisor_override")
+            return output * (kernel_size[0] * kernel_size[1] *
+                             kernel_size[2]) / divisor_override
+
+    op_type = "pool3d"
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out}
+
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'avg',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": not count_include_pad,
+            "data_format": data_format,
+        })
+
+    if divisor_override is None:
+        return pool_out
+    else:
+        check_instance(divisor_override, "divisor_override")
+        return pool_out * (kernel_size[0] * kernel_size[1] *
+                           kernel_size[2]) / divisor_override
+
+
+def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
+    """
+
+    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+    See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool2d` .
+
+    For avg adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
+    Args:
+        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor.
+                          The data type can be float32 or float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two element, (H, W). H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
+            the order of: [batch_size, input_channels, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `data_format` is not "NCHW" or "NHWC".
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool2d
+            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
+            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+            # of input data into m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(i * W / n)
+            #             wend = ceil((i + 1) * W / n)
+            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 32, 32]
+            pool_out = paddle.nn.functional.adaptive_avg_pool2d(
+                            x = x,
+                            output_size=[3, 3])
+            # pool_out.shape is [2, 3, 3, 3]
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'adaptive_avg_pool2d')
+    check_type(data_format, 'data_format', str, 'adaptive_avg_pool2d')
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    if data_format == "NCHW":
+        in_h, in_w = x.shape[2:4]
+    else:
+        in_h, in_w = x.shape[1:3]
+
+    if isinstance(output_size, int):
+        output_size = utils.convert_to_list(output_size, 2, 'output_size')
+    else:
+        if output_size[0] == None:
+            output_size[0] = in_h
+        if output_size[1] == None:
+            output_size[1] = in_w
+
+    if in_dygraph_mode():
+        output = core.ops.pool2d(x, 'pooling_type', 'avg', 'ksize', output_size,
+                                 'global_pooling', False, 'adaptive', True,
+                                 'data_format', data_format)
+        return output
+
+    l_type = 'pool2d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": "avg",
+            "ksize": output_size,
+            "adaptive": True,
+            "data_format": data_format,
+        })
+
+    return pool_out
+
+
+def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
+    """
+
+    This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+    See more detail in :ref:`api_nn_pooling_AdaptiveAvgPool3d` .
+
+    For avg adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+
+    Args:
+        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
+                          The data type can be float32, float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
+            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
+
+    Raises:
+        ValueError: If `data_format` is not "NCDHW" or "NDHWC".
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool3d
+            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
+            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+            # of input data into l * m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(l):
+            #         for j in range(m):
+            #             for k in range(n):
+            #                 dstart = floor(i * D / l)
+            #                 dend = ceil((i + 1) * D / l)
+            #                 hstart = floor(j * H / m)
+            #                 hend = ceil((j + 1) * H / m)
+            #                 wstart = floor(k * W / n)
+            #                 wend = ceil((k + 1) * W / n)
+            #                 output[:, :, i, j, k] =
+            #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 8, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 8, 32, 32]
+            pool_out = paddle.nn.functional.adaptive_avg_pool3d(
+                            x = x,
+                            output_size=[3, 3, 3])
+            # pool_out.shape is [2, 3, 3, 3, 3]
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'adaptive_avg_pool3d')
+    check_type(data_format, 'data_format', str, 'adaptive_avg_pool3d')
+
+    if data_format not in ["NCDHW", "NDHWC"]:
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
+            "Attr(data_format): %s." % str(data_format))
+
+    if data_format == "NCDHW":
+        in_l, in_h, in_w = x.shape[2:5]
+    else:
+        in_l, in_h, in_w = x.shape[1:4]
+
+    if isinstance(output_size, int):
+        output_size = utils.convert_to_list(output_size, 3, 'output_size')
+    else:
+        if output_size[0] == None:
+            output_size[0] = in_l
+        if output_size[1] == None:
+            output_size[1] = in_h
+        if output_size[2] == None:
+            output_size[2] = in_w
+
+    if in_dygraph_mode():
+        output = core.ops.pool3d(x, 'pooling_type', 'avg', 'ksize', output_size,
+                                 'global_pooling', False, 'adaptive', True,
+                                 'data_format', data_format)
+        return output
+
+    l_type = 'pool3d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": pool_out}
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": "avg",
+            "ksize": output_size,
+            "adaptive": True,
+            "data_format": data_format,
+        })
+
+    return pool_out
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index a2cc8fde5ad7147b7af4765de834508f1f3cc825..1dfdac26e990851ac5f192742acd47fb92633d0d 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -12,9 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from ...device import get_cudnn_version
+from ...fluid.framework import core, in_dygraph_mode, Variable
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+from ...fluid import dygraph_utils
+import numpy as np
+
 # TODO: define specitial functions used in computer vision task  
 from ...fluid.layers import affine_channel  #DEFINE_ALIAS
-from ...fluid.layers import affine_grid  #DEFINE_ALIAS
 from ...fluid.layers import anchor_generator  #DEFINE_ALIAS
 from ...fluid.layers import bipartite_match  #DEFINE_ALIAS
 from ...fluid.layers import box_clip  #DEFINE_ALIAS
@@ -44,7 +50,7 @@ from ...fluid.layers import yolov3_loss  #DEFINE_ALIAS
 
 from ...fluid.layers import fsp_matrix  #DEFINE_ALIAS
 from ...fluid.layers import image_resize_short  #DEFINE_ALIAS
-from ...fluid.layers import pixel_shuffle  #DEFINE_ALIAS
+# from ...fluid.layers import pixel_shuffle  #DEFINE_ALIAS
 from ...fluid.layers import retinanet_detection_output  #DEFINE_ALIAS
 from ...fluid.layers import retinanet_target_assign  #DEFINE_ALIAS
 from ...fluid.layers import roi_perspective_transform  #DEFINE_ALIAS
@@ -89,3 +95,313 @@ __all__ = [
     'yolo_box',
     'yolov3_loss'
 ]
+
+
+def affine_grid(theta, out_shape, align_corners=True, name=None):
+    """
+    It generates a grid of (x,y) coordinates using the parameters of
+    the affine transformation that correspond to a set of points where
+    the input feature map should be sampled to produce the transformed
+    output feature map.
+
+    Args:
+        theta (Tensor) - A tensor with shape [N, 2, 3]. It contains a batch of affine transform parameters.
+                           The data type can be float32 or float64.
+        out_shape (Tensor | list | tuple): The shape of target output with format [batch_size, channel, height, width].
+                                             ``out_shape`` can be a Tensor or a list or tuple. The data
+                                             type must be int32.
+        align_corners(bool): Whether to align corners of target feature map and source feature map. Default: True.
+        name(str|None): The default value is None.  Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, A Tensor with shape [batch_size, H, W, 2] while 'H' and 'W' are the height and width of feature map in affine transformation. The data type is the same as `theta`.
+
+    Raises:
+        ValueError: If the type of arguments is not supported.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            paddle.disable_static()
+            # theta shape = [1, 2, 3]
+            theta = np.array([[[-0.7, -0.4, 0.3],
+                               [ 0.6,  0.5, 1.5]]]).astype("float32")
+            theta_t = paddle.to_tensor(theta)
+            y_t = F.affine_grid(
+                    theta_t,
+                    [1, 2, 3, 3],
+                    align_corners=False)
+            print(y_t.numpy())
+            
+            #[[[[ 1.0333333   0.76666665]
+            #   [ 0.76666665  1.0999999 ]
+            #   [ 0.5         1.4333333 ]]
+            #
+            #  [[ 0.5666667   1.1666666 ]
+            #   [ 0.3         1.5       ]
+            #   [ 0.03333333  1.8333334 ]]
+            #
+            #  [[ 0.10000002  1.5666667 ]
+            #   [-0.16666666  1.9000001 ]
+            #   [-0.43333334  2.2333333 ]]]]
+    """
+    helper = LayerHelper('affine_grid')
+
+    if not isinstance(theta, Variable):
+        raise ValueError("The theta should be a Tensor.")
+    check_variable_and_dtype(theta, 'theta', ['float32', 'float64'],
+                             'affine_grid')
+    cudnn_version = get_cudnn_version()
+    if cudnn_version is not None and cudnn_version >= 6000 and align_corners:
+        use_cudnn = True
+    else:
+        use_cudnn = False
+
+    if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
+            isinstance(out_shape, Variable)):
+        raise ValueError("The out_shape should be a list, tuple or Tensor.")
+
+    if in_dygraph_mode():
+        _out_shape = out_shape.numpy().tolist() if isinstance(
+            out_shape, Variable) else out_shape
+        return core.ops.affine_grid(theta, "output_shape", _out_shape,
+                                    "align_corners", align_corners, "use_cudnn",
+                                    use_cudnn)
+
+    out = helper.create_variable_for_type_inference(theta.dtype)
+    ipts = {'Theta': theta}
+    attrs = {"align_corners": align_corners, "use_cudnn": use_cudnn}
+    if isinstance(out_shape, Variable):
+        ipts['OutputShape'] = out_shape
+        check_variable_and_dtype(out_shape, 'out_shape', ['int32'],
+                                 'affine_grid')
+    else:
+        attrs['output_shape'] = out_shape
+
+    helper.append_op(
+        type='affine_grid',
+        inputs=ipts,
+        outputs={'Output': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
+
+
+def grid_sample(x,
+                grid,
+                mode='bilinear',
+                padding_mode='zeros',
+                align_corners=True,
+                name=None):
+    """
+    This operation samples input X by using bilinear interpolation or
+    nearest interpolation based on flow field grid, which is usually
+    generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
+    is the concatenation of (x, y) coordinates with shape [N, H, W] each,
+    where x is indexing the 4th dimension (in width dimension) of input
+    data x and y is indexing the 3rd dimension (in height dimension),
+    finally results is the bilinear interpolation or nearest value of 4 nearest corner
+    points. The output tensor shape will be [N, C, H, W].
+    .. code-block:: text
+        Step 1:
+        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+        .. code-block:: text
+            grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+            grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+        Step 2:
+        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
+        interpolate point value by 4 nearest points or nearest interpolate point value
+        by nearest point.
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- wn
+        For bilinear interpolation:
+        x_w = floor(x)              // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(y)              // north side y coord
+        y_s = y_s + 1               // south side y coord
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-east point value
+        es = X[:, :, y_s, x_w]      // north-east point value
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
+    Args:
+        x(Tensor): The input tensor, which is a 4-d tensor with shape
+                     [N, C, H, W], N is the batch size, C is the channel
+                     number, H and W is the feature height and width.
+                     The data type is float32 or float64.
+        grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
+                        data type is float32 or float64.
+        mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
+                         Default: 'bilinear'.
+        padding_mode(str, optional) The padding method used when source index
+                   is out of input images. It can be 'zeros', 'reflect' and 'border'.
+                   Default: zeros.
+        align_corners(bool, optional): If `align_corners` is true, it will projects
+                   -1 and 1 to the centers of the corner pixels. Otherwise, it will
+                   projects -1 and 1 to the image edges.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            
+            # shape=[1, 1, 3, 3]
+            x = np.array([[[[-0.6,  0.8, -0.5],
+                            [-0.5,  0.2,  1.2],
+                            [ 1.4,  0.3, -0.2]]]]).astype("float64")
+            
+            # grid shape = [1, 3, 4, 2]
+            grid = np.array(
+                         [[[[ 0.2,  0.3],
+                            [-0.4, -0.3],
+                            [-0.9,  0.3],
+                            [-0.9, -0.6]],
+                           [[ 0.4,  0.1],
+                            [ 0.9, -0.8],
+                            [ 0.4,  0.5],
+                            [ 0.5, -0.2]],
+                           [[ 0.1, -0.8],
+                            [-0.3, -1. ],
+                            [ 0.7,  0.4],
+                            [ 0.2,  0.8]]]]).astype("float64")
+            
+            paddle.disable_static()
+            x = paddle.to_tensor(x)
+            grid = paddle.to_tensor(grid)
+            y_t = F.grid_sample(
+                x,
+                grid,
+                mode='bilinear',
+                padding_mode='border',
+                align_corners=True)
+            print(y_t.numpy())
+            
+            # output shape = [1, 1, 3, 4]
+            # [[[[ 0.34   0.016  0.086 -0.448]
+            #    [ 0.55  -0.076  0.35   0.59 ]
+            #    [ 0.596  0.38   0.52   0.24 ]]]]
+    """
+    helper = LayerHelper("grid_sample", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler')
+    check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
+                             'grid_sampler')
+    if not isinstance(x, Variable):
+        raise ValueError("The x should be a Variable")
+    if not isinstance(grid, Variable):
+        raise ValueError("The grid should be a Variable")
+    _modes = ['bilinear', 'nearest']
+    _padding_modes = ['zeros', 'reflect', 'border']
+    if mode not in _modes:
+        raise ValueError(
+            "The mode of grid sample function should be in {}, but got: {}".
+            format(_modes, mode))
+    if padding_mode not in _padding_modes:
+        raise ValueError(
+            "The padding mode of grid sample function should be in {}, but got: {}".
+            format(_padding_modes, padding_mode))
+
+    if not isinstance(align_corners, bool):
+        raise ValueError("The align corners should be bool, but got: {}".format(
+            align_corners))
+
+    cudnn_version = get_cudnn_version()
+    use_cudnn = False
+    if (cudnn_version is not None
+        ) and align_corners and mode == 'bilinear' and padding_mode == 'zeros':
+        use_cudnn = True
+    ipts = {'X': x, 'Grid': grid}
+    attrs = {
+        'mode': mode,
+        'padding_mode': padding_mode,
+        'align_corners': align_corners,
+        'use_cudnn': use_cudnn
+    }
+
+    if in_dygraph_mode():
+        attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners',
+                 align_corners, 'use_cudnn', use_cudnn)
+        out = getattr(core.ops, 'grid_sampler')(x, grid, *attrs)
+    else:
+        out = helper.create_variable_for_type_inference(x.dtype)
+        helper.append_op(
+            type='grid_sampler',
+            inputs=ipts,
+            attrs=attrs,
+            outputs={'Output': out})
+    return out
+
+
+def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
+    """
+    This API implements pixel shuffle operation.
+    See more details in :ref:`api_nn_vision_PixelShuffle` .
+    Parameters:
+        x(Tensor): 4-D tensor, the data type should be float32 or float64.
+        upscale_factor(int): factor to increase spatial resolution.
+        data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+    Returns:
+        Out(tensor): Reshaped tensor according to the new dimension.
+    Raises:
+        ValueError: If the square of upscale_factor cannot divide the channels of input.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+            x = np.random.randn(2, 9, 4, 4).astype(np.float32)
+            paddle.disable_static()
+            x_var = paddle.to_tensor(x)
+            out_var = F.pixel_shuffle(x_var, 3)
+            out = out_var.numpy()
+            print(out.shape) 
+            # (2, 1, 12, 12)
+    """
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'pixel_shuffle')
+
+    if not isinstance(upscale_factor, int):
+        raise TypeError("upscale factor must be int type")
+
+    if data_format not in ["NCHW", "NHWC"]:
+        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'."
+                         "But recevie Attr(data_format): {} ".format(
+                             data_format))
+
+    if in_dygraph_mode():
+        return core.ops.pixel_shuffle(x, "upscale_factor", upscale_factor,
+                                      "data_format", data_format)
+
+    helper = LayerHelper("pixel_shuffle", **locals())
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="pixel_shuffle",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={"upscale_factor": upscale_factor,
+               "data_format": data_format})
+    return out
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index 4963ac360804f88dad9677e1dd9c05a5231c89b9..b25350be601dd9e56d8268859b52a12d3745c44d 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -20,6 +20,9 @@ from . import conv
 from . import extension
 from . import activation
 from . import norm
+from . import vision
+from . import distance
+from . import transformer
 
 from .activation import *
 from .loss import *
@@ -27,6 +30,9 @@ from .conv import *
 from .extension import *
 from .activation import *
 from .norm import *
+from .vision import *
+
+from .transformer import *
 # from .activation import PReLU        #DEFINE_ALIAS
 from .activation import ReLU  #DEFINE_ALIAS
 from .activation import LeakyReLU  #DEFINE_ALIAS
@@ -37,13 +43,40 @@ from .activation import HSigmoid  #DEFINE_ALIAS
 from .common import BilinearTensorProduct  #DEFINE_ALIAS
 from .common import Pool2D  #DEFINE_ALIAS
 from .common import Pad2D  #DEFINE_ALIAS
+from .common import ReflectionPad1d  #DEFINE_ALIAS
+from .common import ReplicationPad1d  #DEFINE_ALIAS
+from .common import ConstantPad1d  #DEFINE_ALIAS
+from .common import ReflectionPad2d  #DEFINE_ALIAS
+from .common import ReplicationPad2d  #DEFINE_ALIAS
+from .common import ConstantPad2d  #DEFINE_ALIAS
+from .common import ZeroPad2d  #DEFINE_ALIAS
+from .common import ReplicationPad3d  #DEFINE_ALIAS
+from .common import ConstantPad3d  #DEFINE_ALIAS
+from .common import CosineSimilarity  #DEFINE_ALIAS
 from .common import Embedding  #DEFINE_ALIAS
 from .common import Linear  #DEFINE_ALIAS
+from .common import Flatten  #DEFINE_ALIAS
 from .common import UpSample  #DEFINE_ALIAS
-from .conv import Conv2D  #DEFINE_ALIAS
-from .conv import Conv2DTranspose  #DEFINE_ALIAS
-from .conv import Conv3D  #DEFINE_ALIAS
-from .conv import Conv3DTranspose  #DEFINE_ALIAS
+from .common import Dropout  #DEFINE_ALIAS
+from .common import Dropout2D  #DEFINE_ALIAS
+from .common import Dropout3D  #DEFINE_ALIAS
+from .common import AlphaDropout  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool2d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool3d  #DEFINE_ALIAS
+from .pooling import AvgPool1d  #DEFINE_ALIAS
+from .pooling import MaxPool1d  #DEFINE_ALIAS
+from .pooling import AdaptiveAvgPool1d  #DEFINE_ALIAS
+from .pooling import AdaptiveMaxPool1d  #DEFINE_ALIAS
+from .pooling import AvgPool2d  #DEFINE_ALIAS
+from .pooling import MaxPool2d  #DEFINE_ALIAS
+from .pooling import AvgPool3d  #DEFINE_ALIAS
+from .pooling import MaxPool3d  #DEFINE_ALIAS
+from .conv import Conv1d  #DEFINE_ALIAS
+from .conv import Conv2d  #DEFINE_ALIAS
+from .conv import Conv3d  #DEFINE_ALIAS
+from .conv import ConvTranspose1d  #DEFINE_ALIAS
+from .conv import ConvTranspose2d  #DEFINE_ALIAS
+from .conv import ConvTranspose3d  #DEFINE_ALIAS
 # from .conv import TreeConv        #DEFINE_ALIAS
 # from .conv import Conv1D        #DEFINE_ALIAS
 from .extension import RowConv  #DEFINE_ALIAS
@@ -55,12 +88,18 @@ from .extension import RowConv  #DEFINE_ALIAS
 # from .learning_rate import PiecewiseDecay        #DEFINE_ALIAS
 # from .learning_rate import PolynomialDecay        #DEFINE_ALIAS
 # from .loss import NCELoss        #DEFINE_ALIAS
+from .loss import BCEWithLogitsLoss  #DEFINE_ALIAS
 from .loss import CrossEntropyLoss  #DEFINE_ALIAS
 from .loss import MSELoss  #DEFINE_ALIAS
 from .loss import L1Loss  #DEFINE_ALIAS
 from .loss import NLLLoss  #DEFINE_ALIAS
 from .loss import BCELoss  #DEFINE_ALIAS
+from .loss import KLDivLoss  #DEFINE_ALIAS
+from .loss import MarginRankingLoss  #DEFINE_ALIAS
+from .loss import CTCLoss  #DEFINE_ALIAS
+from .loss import SmoothL1Loss  #DEFINE_ALIAS
 from .norm import BatchNorm  #DEFINE_ALIAS
+from .norm import SyncBatchNorm  #DEFINE_ALIAS
 from .norm import GroupNorm  #DEFINE_ALIAS
 from .norm import LayerNorm  #DEFINE_ALIAS
 from .norm import SpectralNorm  #DEFINE_ALIAS
@@ -68,3 +107,6 @@ from .norm import InstanceNorm  #DEFINE_ALIAS
 # from .rnn import RNNCell        #DEFINE_ALIAS
 # from .rnn import GRUCell        #DEFINE_ALIAS
 # from .rnn import LSTMCell        #DEFINE_ALIAS
+
+from .vision import PixelShuffle  #DEFINE_ALIAS
+from .distance import PairwiseDistance  #DEFINE_ALIAS
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index 02a1d297e83ea4f21b3f1a9cb85b950e5959dc08..6ce732d95addba1af10ae38506ba0969975ae95d 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -15,19 +15,257 @@
 # TODO: define activation functions of neural network
 
 __all__ = [
-    #       'PReLU',
+    'ELU',
+    'GELU',
+    'Hardshrink',
+    'Tanh',
+    'Hardtanh',
+    'PReLU',
     'ReLU',
+    'ReLU6',
+    'SELU',
     'LeakyReLU',
     'Sigmoid',
-    #       'Softmax',
+    'Softmax',
+    'Softplus',
+    'Softshrink',
+    'Softsign',
+    'Tanhshrink',
+    'LogSigmoid',
     'LogSoftmax',
-    'HSigmoid'
+    'HSigmoid',
 ]
 
 from ...fluid.dygraph import layers
 from ...fluid import core
 from ...fluid.framework import in_dygraph_mode
-from .. import functional
+from ...fluid.param_attr import ParamAttr
+from ...fluid.initializer import Constant
+from paddle.framework import get_default_dtype
+from .. import functional as F
+
+
+class ELU(layers.Layer):
+    """
+    ELU Activation.
+
+    .. math::
+    
+        ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))
+
+    Parameters:
+        alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1,6],[1,15.6]]))
+            m = paddle.nn.ELU(0.2)
+            out = m(x)
+            # [[-0.12642411  6.        ]
+            #  [ 1.          15.6      ]]
+    """
+
+    def __init__(self, alpha=1.0, name=None):
+        super(ELU, self).__init__()
+        self._alpha = alpha
+        self._name = name
+
+    def forward(self, x):
+        return F.elu(x, self._alpha, self._name)
+
+
+class GELU(layers.Layer):
+    """
+    GELU Activation.
+
+    If approximate is True
+
+    .. math::
+
+        GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+
+    else
+
+    .. math::
+
+        GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
+
+    Parameters:
+        approximate (bool, optional): Wether to enable approximation. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]]))
+            
+            m = paddle.nn.GELU()
+            out = m(x) # [-0.158655 0.345731 0.841345 1.39979]
+
+            m = paddle.nn.GELU(True)
+            out = m(x) # [-0.158808 0.345714 0.841192 1.39957]
+    """
+
+    def __init__(self, approximate=False, name=None):
+        super(GELU, self).__init__()
+        self._approximate = approximate
+        self._name = name
+
+    def forward(self, x):
+        return F.gelu(x, self._approximate, self._name)
+
+
+class Hardshrink(layers.Layer):
+    """
+    Hardshrink Activation
+
+    .. math::
+
+        hardshrink(x)=
+            \left\{
+            \begin{aligned}
+            &x, & & if \ x > threshold \\
+            &x, & & if \ x < -threshold \\
+            &0, & & if \ others
+            \end{aligned}
+            \right.
+
+    Parameters:
+        threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+        import paddle
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = paddle.to_tensor(np.array([-1, 0.3, 2.5]))
+        m = paddle.nn.Hardshrink()
+        out = m(x) # [-1., 0., 2.5]
+    """
+
+    def __init__(self, threshold=0.5, name=None):
+        super(Hardshrink, self).__init__()
+        self._threshold = threshold
+        self._name = name
+
+    def forward(self, x):
+        return F.hardshrink(x, self._threshold, self._name)
+
+
+class Tanh(layers.Layer):
+    """
+    Tanh Activation.
+
+    .. math::
+        Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Tanh()
+            out = m(x)
+            print(out.numpy())
+            # [-0.37994896 -0.19737532  0.09966799  0.29131261]
+    """
+
+    def __init__(self, name=None):
+        super(Tanh, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.tanh(x, self._name)
+
+
+class Hardtanh(layers.Layer):
+    """
+    Hardtanh Activation
+
+    .. math::
+
+        Hardtanh(x)= \\begin{cases}
+                        max, \\text{if } x > max \\\\
+                        min, \\text{if } x < min \\\\
+                        x,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        min (float, optional): The value of min for Hardtanh. Default is -1.
+        max (float, optional): The value of max for Hardtanh. Default is 1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5]))
+            m = paddle.nn.Hardtanh()
+            out = m(x) # # [-1., 0.3, 1.]
+    """
+
+    def __init__(self, min=-1.0, max=1.0, name=None):
+        super(Hardtanh, self).__init__()
+        self._min = min
+        self._max = max
+        self._name = name
+
+    def forward(self, x):
+        return F.hardtanh(x, self._min, self._max, self._name)
 
 
 class HSigmoid(layers.Layer):
@@ -154,7 +392,7 @@ class HSigmoid(layers.Layer):
             [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
 
     def forward(self, input, label, path_table=None, path_code=None):
-        out = functional.hsigmoid(
+        out = F.hsigmoid(
             input,
             label,
             self.weight,
@@ -166,188 +404,660 @@ class HSigmoid(layers.Layer):
         return out
 
 
-class ReLU(layers.Layer):
+class PReLU(layers.Layer):
     """
-	:alias_main: paddle.nn.ReLU
-	:alias: paddle.nn.ReLU,paddle.nn.layer.ReLU,paddle.nn.layer.activation.ReLU
+    PReLU Activation.
+
+    .. math::
+
+        PReLU(x) = max(0, x) + weight * min(0, x)
+
+    Parameters:
+        num_parameters (int, optional): Number of `weight` to learn. The supported values are:
+            1 - a single parameter `alpha` is used for all input channels; 
+            Number of channels - a seperate `alpha` is used for each input channel.
+            Default is 1.
+        init (float, optional): Init value of learnable `weight`. Default is 0.25.
+        weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. 
+            Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape. Default dtype is float32.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            paddle.set_default_dtype("float64")
+
+            data = np.array([[[[-2.0,  3.0, -4.0,  5.0],
+                            [ 3.0, -4.0,  5.0, -6.0],
+                            [-7.0, -8.0,  8.0,  9.0]],
+                            [[ 1.0, -2.0, -3.0,  4.0],
+                            [-5.0,  6.0,  7.0, -8.0],
+                            [ 6.0,  7.0,  8.0,  9.0]]]], 'float64')
+            x = paddle.to_tensor(data)
+            m = paddle.nn.PReLU(1, 0.25)
+            out = m(x)
+            # [[[[-0.5 ,  3.  , -1.  ,  5.  ],
+            #    [ 3.  , -1.  ,  5.  , -1.5 ],
+            #    [-1.75, -2.  ,  8.  ,  9.  ]],
+            #   [[ 1.  , -0.5 , -0.75,  4.  ],
+            #    [-1.25,  6.  ,  7.  , -2.  ],
+            #    [ 6.  ,  7.  ,  8.  ,  9.  ]]]]
+    """
+
+    def __init__(self, num_parameters=1, init=0.25, weight_attr=None,
+                 name=None):
+        super(PReLU, self).__init__()
+        self._num_parameters = num_parameters
+        self._init = init
+        self._weight_attr = weight_attr
+        self._name = name
+
+        self._weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=[self._num_parameters],
+            dtype=get_default_dtype(),
+            is_bias=False,
+            default_initializer=Constant(self._init))
+
+    def forward(self, x):
+        return F.prelu(x, self._weight)
+
 
+class ReLU(layers.Layer):
+    """
     ReLU Activation.
 
-    .. math:
+    .. math::
 
-        out = max(x, 0)
+        ReLU(x) = max(x, 0)
 
     Parameters:
-        inplace (bool, optional): If inplace is True, the input and output of 
-            ``ReLU`` are the same variable. Otherwise, the input and output of
-            ``ReLU`` are different variables. Default False. Note that if x is
-            more than one OPs' input, inplace must be False.
-    
-    Returns:
-        None
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
     
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
-          import numpy as np
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
 
-          data = np.array([-2, 0, 1]).astype('float32')
-          my_relu = nn.ReLU()
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = my_relu(data)  # [0, 0, 1]
+            x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32'))
+            m = paddle.nn.ReLU()
+            out = m(x) # [0., 0., 1.]
     """
 
-    def __init__(self, inplace=False):
+    def __init__(self, name=None):
         super(ReLU, self).__init__()
-        self._inplace = inplace
+        self._name = name
 
-    def forward(self, input):
-        return functional.relu(input, self._inplace)
+    def forward(self, x):
+        return F.relu(x, self._name)
 
 
-class LeakyReLU(layers.Layer):
+class ReLU6(layers.Layer):
     """
-	:alias_main: paddle.nn.LeakyReLU
-	:alias: paddle.nn.LeakyReLU,paddle.nn.layer.LeakyReLU,paddle.nn.layer.activation.LeakyReLU
+    ReLU6 Activation
+
+    .. math::
+
+        ReLU6(x) = min(max(0,x), 6)
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-1, 0.3, 6.5]))
+            m = paddle.nn.ReLU6()
+            out = m(x) # [0, 0.3, 6]
+    """
+
+    def __init__(self, name=None):
+        super(ReLU6, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.relu6(x, self._name)
+
+
+class SELU(layers.Layer):
+    """
+    SELU Activation
+
+    .. math::
+
+        SELU(x)= scale *
+                 \\begin{cases}
+                   x, \\text{if } x > 0 \\\\
+                   alpha * e^{x} - alpha, \\text{if } x <= 0
+                 \\end{cases}
+
+    Parameters:
+        scale (float, optional): The value of scale for SELU. Default is 1.0507009873554804934193349852946
+        alpha (float, optional): The value of alpha for SELU. Default is 1.6732632423543772848170429916717
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
 
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
+            m = paddle.nn.SELU()
+            out = m(x) # [[0, 1.050701],[2.101402, 3.152103]]
+    """
+
+    def __init__(self,
+                 scale=1.0507009873554804934193349852946,
+                 alpha=1.6732632423543772848170429916717,
+                 name=None):
+        super(SELU, self).__init__()
+        self._scale = scale
+        self._alpha = alpha
+        self._name = name
+
+    def forward(self, x):
+        return F.selu(x, self._scale, self._alpha, self._name)
+
+
+class LeakyReLU(layers.Layer):
+    """
     Leaky ReLU Activation.
 
     .. math:
 
-        out = max(x, alpha * x)
+        LeakyReLU(x)=
+            \left\{
+            \begin{aligned}
+            &x, & & if \ x >= 0 \\
+            &negative\_slope * x, & & otherwise \\
+            \end{aligned}
+            \right. \\
 
     Parameters:
-        alpha (float, optional): Slope of the activation function at x < 0. Default: 0.01.
-        inplace (bool, optional): If inplace is True, the input and output of 
-            ``LeakyReLU`` are the same variable. Otherwise, the input and output of
-            ``LeakyReLU`` are different variables. Default False. Note that if x is
-            more than one OPs' input, inplace must be False. Default: False.
+        negative_slope (float, optional): Slope of the activation function at
+            :math:`x < 0` . Default is 0.01.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
     
-    Returns:
-        None
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
     
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
-          import numpy as np
+            import paddle
+            import numpy as np
 
-          data = np.array([-2, 0, 1]).astype('float32')
-          lrelu = nn.LeakyReLU()
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = lrelu(data)  # [-0.02, 0, 1]
+            paddle.disable_static()
+
+            m = paddle.nn.LeakyReLU()
+            x = paddle.to_tensor(np.array([-2, 0, 1], 'float32'))
+            out = m(x)  # [-0.02, 0., 1.]
     """
 
-    def __init__(self, alpha=1e-2, inplace=False):
+    def __init__(self, negative_slope=0.01, name=None):
         super(LeakyReLU, self).__init__()
-        self._alpha = alpha
-        self._inplace = inplace
+        self._negative_slope = negative_slope
+        self._name = name
 
-    def forward(self, input):
-        return functional.leaky_relu(input, self._alpha, self._inplace)
+    def forward(self, x):
+        return F.leaky_relu(x, self._negative_slope, self._name)
 
 
 class Sigmoid(layers.Layer):
     """
-	:alias_main: paddle.nn.Sigmoid
-	:alias: paddle.nn.Sigmoid,paddle.nn.layer.Sigmoid,paddle.nn.layer.activation.Sigmoid
+    this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x.
+    
+    .. math::
 
-    Sigmoid Activation.
+        Sigmoid(x) = \frac{1}{1 + e^{-x}}
     
-    .. math:
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        output = \frac{1}{1 + e^{-input}}
+    Shape:
+        x: N-D tensor, available dtype is float16, float32, float64.
 
-    Parameters:
-        inplace (bool, optional): If inplace is True, the input and output
-            are the same variable. Otherwise, the input and output
-            are different variables. Default False. Note that if x is
-            more than one OPs' input, inplace must be False.
-    
     Returns:
-        None
+        A callable object of Sigmoid.
     
     Examples:
+
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
           import numpy as np
-          input = fluid.data(name="input", shape=[None, 4])
-          output = nn.Sigmoid()(input)
-          place = fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(fluid.default_startup_program())
+          import paddle
+
+          paddle.disable_static()
           input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
-          output_data = exe.run(feed={"input": input_data},
-                                fetch_list=[output])
-          print(output_data) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
+          m = paddle.nn.Sigmoid()
+          x = paddle.to_tensor(input_data)
+          output = m(x)
+          print(output.numpy()) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
     """
 
-    def __init__(self, inplace=False):
+    def __init__(self, name=None):
         super(Sigmoid, self).__init__()
-        self._inplace = inplace
+        self.name = name
 
-    def forward(self, input):
-        return functional.sigmoid(input, self._inplace)
+    def forward(self, x):
+        return F.sigmoid(x, self.name)
 
 
-class LogSoftmax(layers.Layer):
+class Softplus(layers.Layer):
+    """
+    Softplus Activation
+
+    .. math::
+
+        Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\
+        \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.}
+
+    Parameters:
+        beta (float, optional): The value of beta for Softplus. Default is 1
+        threshold (float, optional): The value of threshold for Softplus. Default is 20
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Softplus()
+            out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355]
+    """
+
+    def __init__(self, beta=1, threshold=20, name=None):
+        super(Softplus, self).__init__()
+        self._beta = beta
+        self._threshold = threshold
+        self._name = name
+
+    def forward(self, x):
+        return F.softplus(x, self._beta, self._threshold, self._name)
+
+
+class Softshrink(layers.Layer):
+    """
+    Softshrink Activation
+
+    .. math::
+
+        Softshrink(x)= \\begin{cases}
+                        x - threshold, \\text{if } x > threshold \\\\
+                        x + threshold, \\text{if } x < -threshold \\\\
+                        0,  \\text{otherwise}
+                      \\end{cases}
+
+    Parameters:
+        threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8]))
+            m = paddle.nn.Softshrink()
+            out = m(x) # [-0.4, 0, 0, 0.3]
     """
-	:alias_main: paddle.nn.LogSoftmax
-	:alias: paddle.nn.LogSoftmax,paddle.nn.layer.LogSoftmax,paddle.nn.layer.activation.LogSoftmax
 
+    def __init__(self, threshold=0.5, name=None):
+        super(Softshrink, self).__init__()
+        self._threshold = threshold
+        self._name = name
+
+    def forward(self, x):
+        return F.softshrink(x, self._threshold, self._name)
+
+
+class Softsign(layers.Layer):
+    """
+    Softsign Activation
+
+    .. math::
+
+        Softsign(x) = \\frac{x}{1 + |x|}
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Softsign()
+            out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
+    """
+
+    def __init__(self, name=None):
+        super(Softsign, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.softsign(x, self._name)
+
+
+class Tanhshrink(layers.Layer):
+    """
+    Tanhshrink Activation
+
+    .. math::
+
+        Tanhshrink(x) = x - tanh(x)
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
+            m = paddle.nn.Tanhshrink()
+            out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
+    """
+
+    def __init__(self, name=None):
+        super(Tanhshrink, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.tanhshrink(x, self._name)
+
+
+class LogSigmoid(layers.Layer):
+    """
+    LogSigmoid Activation.
+    
+    .. math::
+
+        LogSigmoid(x) = log \\frac{1}{1 + e^{-x}}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, or float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+    
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+            m = paddle.nn.LogSigmoid()
+            out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
+    """
+
+    def __init__(self, name=None):
+        super(LogSigmoid, self).__init__()
+        self._name = name
+
+    def forward(self, x):
+        return F.logsigmoid(x, self._name)
+
+
+class Softmax(layers.Layer):
+    """
+    Softmax Activation.
+
+    This operator implements the softmax layer. The calculation process is as follows:
+
+    1. The dimension :attr:`axis` of ``x`` will be permuted to the last.
+
+    2. Then ``x`` will be logically flattened to a 2-D matrix. The matrix's second
+    dimension(row length) is the same as the dimension :attr:`axis` of ``x``,
+    and the first dimension(column length) is the product of all other dimensions
+    of ``x``. For each row of the matrix, the softmax operator squashes the
+    K-dimensional(K is the width of the matrix, which is also the size of ``x``'s
+    dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional
+    vector of real values in the range [0, 1] that add up to 1.
+
+    3. After the softmax operation is completed, the inverse operations of steps 1 and 2
+    are performed to restore the two-dimensional matrix to the same dimension as the ``x`` .
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
+
+    .. math::
+
+        Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])}
+
+    Example:
+
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+
+          Attrs:
+            axis = -1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+                        [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+
+        Case 2:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+          Attrs:
+            axis = 1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
+                         [0.01786798, 0.01786798, 0.04661262, 0.04661262],
+                         [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
+                        [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
+                         [0.26762315, 0.26762315, 0.26762315, 0.26762315],
+                         [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
+
+    Parameters:
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
+            :math:`axis + D` . Default is -1.
+        dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
+            type of the output tensor. If dtype is specified, ``x`` is casted
+            to ``dtype`` before the operation is performed. This is useful for 
+            preventing data type overflows. Supported dtype: float32, float64.
+            If ``dtype`` is None, the output Tensor has the same dtype as x.
+            Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[2.0, 3.0, 4.0, 5.0],
+                        [3.0, 4.0, 5.0, 6.0],
+                        [7.0, 8.0, 8.0, 9.0]],
+                        [[1.0, 2.0, 3.0, 4.0],
+                        [5.0, 6.0, 7.0, 8.0],
+                        [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Softmax()
+            out = m(x)
+            # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+            # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+    """
+
+    def __init__(self, axis=-1, name=None):
+        super(Softmax, self).__init__()
+        self._axis = axis
+        self._dtype = None
+        self._name = name
+
+    def forward(self, x):
+        return F.softmax(x, self._axis, self._dtype, self._name)
+
+
+class LogSoftmax(layers.Layer):
+    """
     This operator implements the log_softmax layer. The calculation process is as follows:
 
     .. math::
 
         Out[i, j] = log(softmax(x)) 
-                  = log(\\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
+                  = log(\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])})
 
     Parameters:
-        axis (int, optional): The index of dimension to perform softmax calculations, it should be in
-            range :math:`[-1, rank-1]`, while :math:`rank` is the rank of input variable. Default: None. 
-            None and -1 means the last dimension.
-        dtype (np.dtype|core.VarDesc.VarType|str): The desired data type of returned tensor. If specified,
-            the input tensor is casted to dtype before the operation is performed. This is useful for
-            preventing data type overflows. Default: None. Supported dtype: float32 or float64
+        axis (int, optional): The axis along which to perform log_softmax
+            calculations. It should be in range [-D, D), where D is the
+            dimensions of the input Tensor . If ``axis`` < 0, it works the
+            same way as :math:`axis + D` . Default is -1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
  
-    Returns:
-        None
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
 
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import paddle.nn as nn
-          import numpy as np
+        import paddle
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = np.array([[[-2.0, 3.0, -4.0, 5.0],
+                        [3.0, -4.0, 5.0, -6.0],
+                        [-7.0, -8.0, 8.0, 9.0]],
+                        [[1.0, -2.0, -3.0, 4.0],
+                        [-5.0, 6.0, 7.0, -8.0],
+                        [6.0, 7.0, 8.0, 9.0]]])
+        m = paddle.nn.LogSoftmax()
+        x = paddle.to_tensor(x)
+        out = m(x)
+        # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
+        #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
+        #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
+        #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
+        #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
+        #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
+    """
 
-          data = np.array([[[-2.0, 3.0, -4.0, 5.0],
-                            [3.0, -4.0, 5.0, -6.0],
-                            [-7.0, -8.0, 8.0, 9.0]],
-                           [[1.0, -2.0, -3.0, 4.0],
-                            [-5.0, 6.0, 7.0, -8.0],
-                            [6.0, 7.0, 8.0, 9.0]]]).astype('float32')
-          my_log_softnmax = nn.LogSoftmax()
-          with fluid.dygraph.guard():
-              data = fluid.dygraph.to_variable(data)
-              res = my_log_softnmax(data)
-              # [[[ -7.1278396   -2.1278396   -9.127839    -0.12783948]
-              #   [ -2.1270514   -9.127051    -0.12705144 -11.127051  ]
-              #   [-16.313261   -17.313261    -1.3132617   -0.31326184]]
-              #  [[ -3.0518122   -6.051812    -7.051812    -0.051812  ]
-              #   [-12.313267    -1.3132664   -0.3132665  -15.313267  ]
-              #   [ -3.4401896   -2.4401896   -1.4401896   -0.44018966]]]
-    """
-
-    def __init__(self, axis=None):
+    def __init__(self, axis=-1, name=None):
         super(LogSoftmax, self).__init__()
         self._axis = axis
+        self._name = name
 
-    def forward(self, input):
-        return functional.log_softmax(input, self._axis)
+    def forward(self, x):
+        return F.log_softmax(x, self._axis)
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 8125e528b195b28024915ed9c20b922bd6224a5e..8a73cfb8ccda15505d8668eff6776aac387c134f 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -12,17 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the common classes to build a neural network  
+# TODO: define the common classes to build a neural network
 from ...fluid.dygraph import BilinearTensorProduct  #DEFINE_ALIAS
 from ...fluid.dygraph import Pool2D  #DEFINE_ALIAS
 from ...fluid.dygraph import Embedding  #DEFINE_ALIAS
 from ...fluid.dygraph import Linear  #DEFINE_ALIAS
+from ...fluid.dygraph import Flatten  #DEFINE_ALIAS
 from ...fluid.dygraph import layers
 from .. import functional as F
+from ...fluid.framework import _dygraph_tracer
 
 __all__ = [
-    'BilinearTensorProduct', 'Pool2D', 'Embedding', 'Linear', 'UpSample',
-    'Pad2D'
+    'BilinearTensorProduct',
+    'Pool2D',
+    'Embedding',
+    'Linear',
+    'UpSample',
+    'Pad2D',
+    'ReflectionPad1d',
+    'ReplicationPad1d',
+    'ConstantPad1d',
+    'ReflectionPad2d',
+    'ReplicationPad2d',
+    'ConstantPad2d',
+    'ZeroPad2d',
+    'ConstantPad3d',
+    'ReplicationPad3d',
+    'CosineSimilarity',
+    'Dropout',
+    'Dropout2D',
+    'Dropout3D',
+    'Bilinear',
+    'AlphaDropout',
 ]
 
 
@@ -257,12 +278,10 @@ class Pad2D(layers.Layer):
     """
         :alias_main: paddle.nn.Pad2D
         :alias: paddle.nn.Pad2D,paddle.nn.layer.Pad2D,paddle.nn.layer.common.Pad2D
-
     This interface is used to construct a callable object of the ``Pad2D``  class.
     The Pad2D layer pads the input tensor boundaries according to 'paddings' and 'mode'.
     If mode is 'reflect', paddings[0] and paddings[1] must be no greater
     than height-1. And the width dimension has the same condition.
-
     Parameters:
         paddings (int | List[int32]): The padding size. If padding is a int, uses the same 
             padding in all boundaries, if padding is a List, it must contain four integers, 
@@ -277,16 +296,12 @@ class Pad2D(layers.Layer):
         data_format (str): An string from: "NHWC", "NCHW". Specify the data format of
                            the input data.
                            Default is  "NCHW"
-
     Returns: 
         None
-
     Examples:
         .. code-block:: text
-
             Input = [[[[1., 2., 3.],
                        [4., 5., 6.]]]]
-
             Case 0:
                 paddings = [0, 1, 2, 3],
                 mode = 'constant'
@@ -294,24 +309,20 @@ class Pad2D(layers.Layer):
                 Out = [[[[0., 0., 1., 2., 3., 0., 0., 0.],
                          [0., 0., 4., 5., 6., 0., 0., 0.],
                          [0., 0., 0., 0., 0., 0., 0., 0.]]]]
-
             Case 1:
                 paddings = [0, 1, 2, 1],
                 mode = 'reflect'
                 Out = [[[[3., 2., 1., 2., 3., 2.],
                          [6., 5., 4., 5., 6., 5.],
                          [3., 2., 1., 2., 3., 2.]]]]
-
             Case 2:
                 paddings = [0, 1, 2, 1],
                 mode = 'edge'
                 Out = [[[[1., 1., 1., 2., 3., 3.],
                          [4., 4., 4., 5., 6., 6.],
                          [4., 4., 4., 5., 6., 6.]]]]
-
     Code Examples:
         .. code-block:: python
-
             import paddle.fluid as fluid
             import paddle.nn as nn
             import numpy as np
@@ -341,3 +352,942 @@ class Pad2D(layers.Layer):
             mode=self._mode,
             pad_value=self._pad_value,
             data_format=self._data_format)
+
+
+class Bilinear(layers.Layer):
+    """
+
+    This layer performs bilinear on two inputs.
+
+    .. math::
+      out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1
+      out = out + b
+
+    In this formula:
+     - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features].
+     - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features].
+     - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features].
+     - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features].
+     - :math:`b`: the learned bias, shape is [1, out_features].
+     - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`.
+
+    Parameters:
+       in1_features (int): The dimension of each first input(`x1`).
+       in2_features (int): The dimension of each second input(`x2`).
+       out_features (int): The dimension of output of this layer.
+       weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of 
+       this layer. The default value is None.
+       bias_attr (ParamAttr, optional): The parameter attribute for the bias
+           of this layer. If it is set to False, no bias will be added to the output units.
+           If it is set to None, the bias is initialized zero. The default value is None.       
+       name (str, optional): The default value is None. Normally there is no need for user
+           to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
+
+    Attribute:
+        **weight** (Parameter): the learnable weights of this layer.
+
+        **bias** (Parameter): the learnable bias of this layer.
+
+    Returns:
+       Variable: A 2-D Tensor of shape [batch_size, out_features].
+
+    Examples:
+       .. code-block:: python
+
+        import paddle
+        import numpy
+
+        paddle.disable_static()
+        layer1 = numpy.random.random((5, 5)).astype('float32')
+        layer2 = numpy.random.random((5, 4)).astype('float32')
+        bilinear = paddle.nn.Bilinear(
+            in1_features=5, in2_features=4, out_features=1000)
+        result = bilinear(paddle.to_tensor(layer1),
+                        paddle.to_tensor(layer2))     # result shape [5, 1000]
+
+    """
+
+    def __init__(self,
+                 in1_features,
+                 in2_features,
+                 out_features,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(Bilinear, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._name = name
+        self._in1_features = in1_features
+        self._in2_features = in2_features
+        self._out_features = out_features
+        self._dtype = self._helper.get_default_dtype()
+
+        weight_shape = [
+            self._out_features, self._in1_features, self._in2_features
+        ]
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=weight_shape,
+            dtype=self._dtype,
+            is_bias=False)
+        bias_shape = [1, self._out_features]
+        self.bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=bias_shape,
+            dtype=self._dtype,
+            is_bias=True)
+
+    def forward(self, x1, x2):
+        return F.bilinear(x1, x2, self.weight, self.bias, self._name)
+
+
+class Dropout(layers.Layer):
+    """
+    Dropout is a regularization technique for reducing overfitting by preventing
+    neuron co-adaption during training as described in the paper:
+    `Improving neural networks by preventing co-adaptation of feature detectors <https://arxiv.org/abs/1207.0580>`_ 
+    The dropout operator randomly sets the outputs of some units to zero, while upscale others
+    according to the given dropout probability.
+
+    See ``paddle.nn.functional.dropout`` for more details.
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float | int): Probability of setting units to zero. Default: 0.5
+        axis (int | list): The axis along which the dropout is performed. Default None.
+        mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']
+
+                               1. upscale_in_train(default), upscale the output at training time
+
+                                  - train: out = input * mask / ( 1.0 - p )
+                                  - inference: out = input
+
+                               2. downscale_in_infer, downscale the output at inference
+
+                                  - train: out = input * mask
+                                  - inference: out = input * (1.0 - p)
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: N-D tensor.
+        - output: N-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[1,2,3], [4,5,6]]).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Dropout(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None):
+        super(Dropout, self).__init__()
+
+        self.p = p
+        self.axis = axis
+        self.mode = mode
+        self.name = name
+
+    def forward(self, input):
+        out = F.dropout(
+            input,
+            p=self.p,
+            axis=self.axis,
+            training=self.training,
+            mode=self.mode,
+            name=self.name)
+        return out
+
+
+class Dropout2D(layers.Layer):
+    """
+    Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` ,
+    a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+    Dropout2d will help promote independence between feature maps as described in the paper: 
+    `Efficient Object Localization Using Convolutional Networks <https://arxiv.org/abs/1411.4280>`_ 
+
+    See ``paddle.nn.functional.dropout2d`` for more details.
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float, optional): Probability of setting units to zero. Default: 0.5
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    `NCHW`, `NHWC`. The default is `NCHW`. When it is `NCHW`, the data is
+                                     stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: 4-D tensor.
+        - output: 4-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5)).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Dropout2D(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, data_format='NCHW', name=None):
+        super(Dropout2D, self).__init__()
+
+        self.p = p
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, input):
+        out = F.dropout2d(
+            input,
+            p=self.p,
+            training=self.training,
+            data_format=self.data_format,
+            name=self.name)
+        return out
+
+
+class Dropout3D(layers.Layer):
+    """
+    Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` ,
+    a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently
+    on every forward call with probability `p` using samples from a Bernoulli distribution.
+    Dropout3d will help promote independence between feature maps as described in the paper: 
+    `Efficient Object Localization Using Convolutional Networks <https://arxiv.org/abs/1411.4280>`_ 
+
+    See ``paddle.nn.functional.dropout3d`` for more details.
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float | int): Probability of setting units to zero. Default: 0.5
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+                                     will be consistent with that of the input. An optional string from:
+                                    `NCDHW`, `NDHWC`. The default is `NCDHW`. When it is `NCDHW`, the data is
+                                     stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: 5-D tensor.
+        - output: 5-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.Dropout3D(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, data_format='NCDHW', name=None):
+        super(Dropout3D, self).__init__()
+
+        self.p = p
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, input):
+        out = F.dropout3d(
+            input,
+            p=self.p,
+            training=self.training,
+            data_format=self.data_format,
+            name=self.name)
+        return out
+
+
+class AlphaDropout(layers.Layer):
+    """
+    Alpha Dropout is a type of Dropout that maintains the self-normalizing property. For an input with
+    zero mean and unit standard deviation, the output of Alpha Dropout maintains the original mean and
+    standard deviation of the input. Alpha Dropout fits well to SELU activate function by randomly setting
+    activations to the negative saturation value.
+
+    For more information, please refer to:
+    `Self-Normalizing Neural Networks <https://arxiv.org/abs/1706.02515>`_
+
+    In dygraph mode, please use ``eval()`` to switch to evaluation mode, where dropout is disabled.
+
+    Parameters:
+        p (float | int): Probability of setting units to zero. Default: 0.5
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: N-D tensor.
+        - output: N-D tensor, the same shape as input.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.array([[-1, 1], [-1, 1]]).astype('float32')
+            x = paddle.to_tensor(x)
+            m = paddle.nn.AlphaDropout(p=0.5)
+            y_train = m(x)
+            m.eval()  # switch the model to test phase
+            y_test = m(x)
+            print(x.numpy())
+            print(y_train.numpy())
+            # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly)
+            print(y_test.numpy())
+   """
+
+    def __init__(self, p=0.5, name=None):
+        super(AlphaDropout, self).__init__()
+        self.p = p
+        self.name = name
+
+    def forward(self, input):
+        out = F.alpha_dropout(
+            input, p=self.p, training=self.training, name=self.name)
+        return out
+
+
+class ReflectionPad1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReflectionPad1d`` class.
+    Uses reflection of the input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right).
+        data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is  "NCL"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[1., 2., 3.],
+                  [4., 5., 6.]]]
+            padding = [1, 2],
+            Out = [[[2. 1. 2. 3. 2. 1.]
+                    [5. 4. 5. 6. 5. 4.]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 2, 3)
+            pad = [1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReflectionPad1d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[2. 1. 2. 3. 2. 1.]
+            #   [5. 4. 5. 6. 5. 4.]]]
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super(ReflectionPad1d, self).__init__()
+        self._mode = "reflect"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReplicationPad1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReplicationPad1d`` class.
+    Uses input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right).
+        data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is  "NCL"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[1., 2., 3.],
+                  [4., 5., 6.]]]
+            padding = [1, 2],
+            Out = [[[2. 1. 2. 3. 2. 1.]
+                    [5. 4. 5. 6. 5. 4.]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 2, 3)
+            pad = [1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad1d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[1. 1. 2. 3. 3. 3.]
+            #   [1. 4. 5. 6. 6. 6.]]]
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super(ReplicationPad1d, self).__init__()
+        self._mode = "replicate"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ConstantPad1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConstantPad1d`` class.
+    Uses a constant value to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right).
+        value (float32): The value to fill the padded areas. Default is 0.0
+        data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is  "NCL"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[1., 2., 3.],
+                  [4., 5., 6.]]]
+            padding = [1, 2],
+            value = 0.0
+            Out = [[[0. 1. 2. 3. 0. 0.]
+                    [0. 4. 5. 6. 0. 0.]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 2, 3)
+            pad = [1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ConstantPad1d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[0. 1. 2. 3. 0. 0.]
+            #   [0. 4. 5. 6. 0. 0.]]]
+    """
+
+    def __init__(self, padding, value=0.0, data_format="NCL", name=None):
+        super(ConstantPad1d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._value = value
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     value=self._value,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ConstantPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConstantPad2d`` class.
+    Uses a constant value to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        value (float32): The value to fill the padded areas. Default is 0.0
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            value = 0.0
+            Out = [[[[0. 1. 2. 3. 0.]
+                     [0. 4. 5. 6. 0.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 2, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ConstantPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[0. 0. 0. 0.]
+            #    [0. 1. 2. 3.]
+            #    [0. 4. 5. 6.]
+            #    [0. 0. 0. 0.]
+            #    [0. 0. 0. 0.]]]]
+    """
+
+    def __init__(self, padding, value=0.0, data_format="NCHW", name=None):
+        super(ConstantPad2d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._value = value
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     value=self._value,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ZeroPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad2d`` class.
+    Uses 0 to pad the input tensor.
+
+    Parameters:
+        padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            Out = [[[[0. 1. 2. 3. 0.]
+                     [0. 4. 5. 6. 0.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 2, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ZeroPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[0. 0. 0. 0.]
+            #    [0. 1. 2. 3.]
+            #    [0. 4. 5. 6.]
+            #    [0. 0. 0. 0.]
+            #    [0. 0. 0. 0.]]]]
+    """
+
+    def __init__(self, padding, data_format="NCHW", name=None):
+        super(ZeroPad2d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReplicationPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReplicationPad2d`` class.
+    Uses input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            Out = [[[[1. 1. 2. 3. 3.]
+                     [4. 4. 5. 6. 6.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 2, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[1. 1. 2. 3.]
+            #    [1. 1. 2. 3.]
+            #    [4. 4. 5. 6.]
+            #    [4. 4. 5. 6.]
+            #    [4. 4. 5. 6.]]]]
+    """
+
+    def __init__(self, padding, data_format="NCHW", name=None):
+        super(ReplicationPad2d, self).__init__()
+        self._mode = "replicate"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReflectionPad2d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReflectionPad2d`` class.
+    Uses reflection of the input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
+        data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data.
+           Default is  "NCHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[1., 2., 3.],
+                   [4., 5., 6.]]]]
+            padding = [1, 1, 0, 0]
+            Out = [[[[2. 1. 2. 3. 2.]
+                     [5. 4. 5. 6. 5.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 4, 3)
+            pad = [1, 0, 1, 2]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReflectionPad2d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[ 5.  4.  5.  6.]
+            #    [ 2.  1.  2.  3.]
+            #    [ 5.  4.  5.  6.]
+            #    [ 8.  7.  8.  9.]
+            #    [11. 10. 11. 12.]
+            #    [ 8.  7.  8.  9.]
+            #    [ 5.  4.  5.  6.]]]]
+    """
+
+    def __init__(self, padding, data_format="NCHW", name=None):
+        super(ReflectionPad2d, self).__init__()
+        self._mode = "reflect"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ConstantPad3d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConstantPad3d`` class.
+    Uses a constant value to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        value (float32): The value to fill the padded areas. Default is 0.0
+        data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+           Default is  "NCDHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[[1., 2., 3.],
+                    [4., 5., 6.]]]]]
+            padding = [1, 2, 0, 0, 0, 0]
+            value = 0.0
+            Out = [[[[[0. 1. 2. 3. 0. 0.]
+                      [0. 4. 5. 6. 0. 0.]]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 1, 2, 3)
+            pad = [1, 0, 1, 2, 0, 0]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ConstantPad3d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[[0. 0. 0. 0.]
+            #     [0. 1. 2. 3.]
+            #     [0. 4. 5. 6.]
+            #     [0. 0. 0. 0.]
+            #     [0. 0. 0. 0.]]]]]
+    """
+
+    def __init__(self, padding, value=0.0, data_format="NCDHW", name=None):
+        super(ConstantPad3d, self).__init__()
+        self._mode = "constant"
+        self._data_format = data_format
+        self._pad = padding
+        self._value = value
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     value=self._value,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class ReplicationPad3d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ReplicationPad3d`` class.
+    Uses input boundaries to pad the input tensor.
+
+    Parameters:
+        padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions
+            of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+           Default is  "NCDHW"
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            x = [[[[[1., 2., 3.],
+                    [4., 5., 6.]]]]]
+            padding = [1, 2, 0, 0, 0, 0]
+            Out = [[[[[1. 1. 2. 3. 3. 3.]
+                      [4. 4. 5. 6. 6. 6.]]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            input_shape = (1, 1, 1, 2, 3)
+            pad = [1, 0, 1, 2, 0, 0]
+            data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1
+            my_pad = nn.ReplicationPad3d(padding=pad)
+            data = paddle.to_tensor(data)
+            result = my_pad(data)
+            print(result.numpy())
+            # [[[[[1. 1. 2. 3.]
+            #     [1. 1. 2. 3.]
+            #     [4. 4. 5. 6.]
+            #     [4. 4. 5. 6.]
+            #     [4. 4. 5. 6.]]]]]
+    """
+
+    def __init__(self, padding, data_format="NCDHW", name=None):
+        super(ReplicationPad3d, self).__init__()
+        self._mode = "replicate"
+        self._data_format = data_format
+        self._pad = padding
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(x,
+                     pad=self._pad,
+                     mode=self._mode,
+                     data_format=self._data_format,
+                     name=self._name)
+
+
+class CosineSimilarity(layers.Layer):
+    """
+    This interface is used to compute cosine similarity between x1 and x2 along axis.
+
+    Parameters:
+        axis (int): Dimension of vectors to compute cosine similarity. Default is 1.
+        eps(float): Small value to avoid division by zero. Default is 1e-8.
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: text
+
+            Case 0:
+                x1 = [[0.8024077  0.9927354  0.27238318 0.8344984 ]
+                     [0.48949873 0.5797396  0.65444374 0.66510963]
+                     [0.1031398  0.9614342  0.08365563 0.6796464 ]
+                     [0.10760343 0.7461209  0.7726148  0.5801006 ]]
+                x2 = [[0.62913156 0.1536727  0.9847992  0.04591406]
+                     [0.9098952  0.15715368 0.8671125  0.3156102 ]
+                     [0.4427798  0.54136837 0.5276275  0.32394758]
+                     [0.3769419  0.8535014  0.48041078 0.9256797 ]]
+                axis = 1
+                eps = 1e-8
+                Out: [0.5275037  0.8368967  0.75037485 0.9245899]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+            paddle.disable_static()
+
+            np.random.seed(0)
+            x1 = np.random.rand(2,3)
+            x2 = np.random.rand(2,3)
+            x1 = paddle.to_tensor(x1)
+            x2 = paddle.to_tensor(x2)
+
+            cos_sim_func = nn.CosineSimilarity(axis=0)
+            result = cos_sim_func(x1, x2)
+            print(result.numpy())
+            # [0.99806249 0.9817672  0.94987036]
+    """
+
+    def __init__(self, axis=1, eps=1e-8):
+        super(CosineSimilarity, self).__init__()
+        self._axis = axis
+        self._eps = eps
+
+    def forward(self, x1, x2):
+        return F.cosine_similarity(x1, x2, axis=self._axis, eps=self._eps)
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 9fb6c9ebc2e404ab477630aae99a6b43d683b20b..7d0e59fb7575c9d15d28e88a462aed4ddba47fb9 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -15,12 +15,12 @@
 # TODO: define classes of convolutional neural network
 
 __all__ = [
-    'Conv2D',
-    'Conv2DTranspose',
-    'Conv3D',
-    'Conv3DTranspose',
-    #       'TreeConv',
-    #       'Conv1D'
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
 ]
 
 import numpy as np
@@ -38,12 +38,270 @@ def _get_default_param_initializer(num_channels, filter_size):
     return Normal(0.0, std, 0)
 
 
-class Conv2D(layers.Layer):
+def _reverse_repeat_list(t, n):
+    """Reverse the order of `t` and repeat each element for `n` times.
+    This can be used to translate padding arg used by Conv and Pooling modules
+    to the ones used by `F.pad`.
     """
-	:alias_main: paddle.nn.Conv2D
-	:alias: paddle.nn.Conv2D,paddle.nn.layer.Conv2D,paddle.nn.layer.conv.Conv2D
+    return list(x for x in reversed(t) for _ in range(n))
 
-    This interface is used to construct a callable object of the ``Conv2D`` class.
+
+class _ConvNd(layers.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 transposed,
+                 dims,
+                 stride=1,
+                 padding=0,
+                 padding_mode='zeros',
+                 output_padding=0,
+                 dilation=1,
+                 groups=1,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCHW"):
+        super(_ConvNd, self).__init__()
+        assert weight_attr is not False, "weight_attr should not be False in Conv."
+        self._param_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._groups = groups
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._data_format = data_format
+
+        valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
+        if padding_mode not in valid_padding_modes:
+            raise ValueError(
+                "padding_mode must be one of {}, but got padding_mode='{}'".
+                format(valid_padding_modes, padding_mode))
+
+        if padding_mode in {'reflect', 'replicate', 'circular'
+                            } and not isinstance(padding, np.int):
+            raise TypeError(
+                "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int"
+            )
+
+        self._stride = utils.convert_to_list(stride, dims, 'stride')
+        self._dilation = utils.convert_to_list(dilation, dims, 'dilation')
+        self._kernel_size = utils.convert_to_list(kernel_size, dims,
+                                                  'kernel_size')
+        self._padding = padding
+        self._padding_mode = padding_mode
+        self.output_padding = output_padding
+
+        if transposed:
+            filter_shape = [self._in_channels, out_channels // groups
+                            ] + self._kernel_size
+        else:
+            if in_channels % groups != 0:
+                raise ValueError("in_channels must be divisible by groups.")
+
+            if padding_mode in {'reflect', 'replicate', 'circular'}:
+                _paired_padding = utils.convert_to_list(padding, 2, 'padding')
+                self._reversed_padding_repeated_twice = _reverse_repeat_list(
+                    _paired_padding, 2)
+
+            filter_shape = [out_channels, in_channels // groups
+                            ] + self._kernel_size
+
+        self.weight = self.create_parameter(
+            shape=filter_shape, attr=self._param_attr)
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=[self._out_channels], is_bias=True)
+
+
+class Conv1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``Conv1d`` class.
+    For more details, refer to code examples.
+    The convolution1D layer calculates the output based on the input, filter
+    and stride, padding, dilation, groups parameters. Input and
+    Output are in NCL format or NLC format, where N is batch size, C is the number of
+    the feature map, L is the length of the feature map.
+    Filter's shape is [MCK] , where M is the number of output feature map,
+    C is the number of input feature map, K is the size of the kernel. 
+    If the groups is greater than 1, C will equal the number of input feature map divided by the groups.
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+    For each input :math:`X`, the equation is:
+    .. math::
+        Out = \\sigma (W \\ast X + b)
+    Where:
+    * :math:`X`: Input value, a ``Tensor`` with 'NCL' format or 'NLC' format.
+    * :math:`W`: Filter value, a ``Tensor`` with shape [MCK] .
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    Example:
+        - Input:
+          Input shape: :math:`(N, C_{in}, L_{in})`
+          Kernel shape: :math:`(C_{out}, C_{in}, K)`
+        - Output:
+          Output shape: :math:`(N, C_{out}, L_{out})`
+        Where
+        .. math::
+            L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1
+    Parameters:
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of filter. It is as same as the output
+            feature map.
+        kernel_size (int|tuple|list): The filter size. If kernel_size is a tuple,
+            it must contain one integer, (kernel_size).
+        stride (int|tuple|list, optional): The stride size. If stride is a tuple, it must
+            contain one integer, (stride_size). Default: 1.
+        padding(int|str|tuple|list, optional): The size of zeros to be padded. It must be in one of the following forms.
+            1. a string in ['valid', 'same'].
+            2. an int, which means the feature map is zero paded by size of `padding` on both sides.
+            3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides.
+            The default value is 0.
+        dilation (int|tuple|list, optional): The dilation size. If dilation is a tuple, it must
+            contain one integer, (dilation_size). Default: 1.
+        groups (int, optional): The groups number of the conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: 1.
+        padding_mode(str, optional): Four modes: 'zeros', 'reflect', 'replicate', 'circular'.
+            When in 'zeros' mode, this op uses zeros to pad the input tensor.
+            When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor.
+            When in 'replicate' mode, uses input boundaries to pad the input tensor.
+            When in 'circular' mode, uses circular input to pad the input tensor.
+            Default is 'zeros'.
+        bias(bool, optional): Whether to use bias. Default: True.
+        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+            of conv1d. If it is set to None or one attribute of ParamAttr, conv1d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv1d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv1d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+    Attribute:
+        **weight** (Parameter): the learnable weights of filter of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+    Shape:
+        - x: 3-D tensor with shape: (batch, in_channels, length) or (batch, length, in_channels).
+        - output: 3-D tensor with same shape as input x.
+    
+    Raises:
+        None
+    Examples:
+        .. code-block:: python
+          import paddle
+          from paddle.nn import Conv1d
+          import numpy as np
+          x = np.array([[[4, 8, 1, 9],
+            [7, 2, 0, 9],
+            [6, 9, 2, 6]]]).astype(np.float32)
+          w=np.array(
+          [[[9, 3, 4],
+            [0, 0, 7],
+            [2, 5, 6]],
+           [[0, 3, 4],
+            [2, 9, 7],
+            [5, 6, 8]]]).astype(np.float32)
+          paddle.disable_static()
+          x_t = paddle.to_tensor(x)
+          conv = Conv1d(3, 2, 3)
+          conv.weight.set_value(w)
+          y_t = conv(x_t)
+          y_np = y_t.numpy()
+          print(y_np)
+          # [[[133. 238.]
+          #   [160. 211.]]]
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 padding_mode='zeros',
+                 bias=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCL",
+                 name=None):
+        super(Conv1d, self).__init__()
+        assert weight_attr is not False, "param_attr should not be False here."
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._groups = groups
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups.")
+        self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
+        self._stride = utils.convert_to_list(stride, 1, 'stride')
+        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
+        self._padding = padding  # leave it to F.conv1d
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._data_format = data_format
+        self._name = name
+
+        self._padding_mode = padding_mode
+
+        valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
+        if padding_mode not in valid_padding_modes:
+            raise ValueError(
+                "padding_mode must be one of {}, but got padding_mode='{}'".
+                format(valid_padding_modes, padding_mode))
+
+        if padding_mode in {'reflect', 'replicate', 'circular'
+                            } and not isinstance(padding, np.int):
+            raise ValueError(
+                "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int"
+            )
+        if not isinstance(padding, str):
+            self._padding = utils.convert_to_list(padding, 1, 'padding') * 2
+
+        num_filter_channels = in_channels // groups
+        filter_shape = [self._out_channels, num_filter_channels
+                        ] + self._kernel_size
+
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=filter_shape,
+            default_initializer=_get_default_param_initializer(
+                self._in_channels, filter_shape))
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=[self._out_channels],
+            is_bias=True) if bias else None
+
+    def forward(self, x):
+        padding = 0
+        if self._padding_mode != "zeros":
+            x = F.pad(x,
+                      self._padding,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+        else:
+            padding = self._padding
+
+        out = F.conv1d(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format,
+            name=self._name)
+        return out
+
+
+class Conv2d(_ConvNd):
+    """
+    This interface is used to construct a callable object of the ``Conv2d`` class.
     For more details, refer to code examples.
     The convolution2D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input and
@@ -59,48 +317,23 @@ class Conv2D(layers.Layer):
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
     applied to the final result.
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \\sigma (W \\ast X + b)
-
     Where:
-
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of filter. It is as same as the output
-            feature map.
-        filter_size (int or tuple): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of channels produced by convolution.
+        kernel_size (int|list|tuple): The size of convolution kernel.
+        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`on both sides 
@@ -108,10 +341,8 @@ class Conv2D(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride (int or tuple, optional): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: 1.
-        dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` .
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation. Default: 1.
         groups (int, optional): The groups number of the Conv2d Layer. According to grouped
@@ -119,129 +350,287 @@ class Conv2D(layers.Layer):
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
             and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d.
+        bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCHW" or "NHWC". Default: "NCHW".
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filter of this layer.
-
         **bias** (Parameter or None): the learnable bias of this layer.
-
-    Returns:
-        None
-    
-    Raises:
-        ValueError: if ``use_cudnn`` is not a bool value.
-
+    Shape:
+        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel_size[0] - 1) + 1))}{strides[0]} + 1 \\\\
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel_size[1] - 1) + 1))}{strides[1]} + 1
     Examples:
         .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv2D(4, 6, (3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.Conv2d(4, 6, (3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 6, 6)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 padding=0,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride=1,
+                 padding=0,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 padding_mode='zeros',
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCHW"):
+        super(Conv2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            2,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x):
+        if self._padding_mode != 'zeros':
+            x = F.pad(x,
+                      self._reversed_padding_repeated_twice,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+            return F.conv2d(
+                x,
+                self.weight,
+                bias=self.bias,
+                stride=self._stride,
+                dilation=self._dilation,
+                groups=self._groups,
+                data_format=self._data_format)
+
+        out = F.conv2d(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=self._padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format)
+        return out
+
+
+class ConvTranspose1d(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``ConvTranspose1d`` class.
+    For more details, refer to code examples.
+    The 1-D convolution transpose layer calculates the output based on the input,
+    filter, and dilation, stride, padding. Input(Input) and output(Output)
+    are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels,
+    L is the length of the feature. The details of convolution transpose
+    layer, please refer to the following explanation and references
+    `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a 3-D Tensor with 'NCL' format or 'NLC' format.
+    * :math:`W`: Kernel value, a 3-D Tensor with 'MCK' format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, a 3-D Tensor with data format 'NCL' of 'NLC', the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, L_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, L_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, L_{out})`
+
+        Where
+
+        .. math::
+
+           L^\prime_{out} &= (L_{in} - 1) * stride - pad_top - pad_bottom + dilation * (L_f - 1) + 1 \\\\
+           L_{out} &\in [ L^\prime_{out}, L^\prime_{out} + stride ]
+
+    Note:
+          The conv1d_transpose can be seen as the backward of the conv1d. For conv1d,
+          when stride > 1, conv1d maps multiple input shape to the same output shape,
+          so for conv1d_transpose, when stride > 1, input shape maps multiple output shape.
+          If output_size is None, :math:`L_{out} = L^\prime_{out}`;
+          else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}`
+          and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically.
+
+    Args:
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of the filter. It is as same as the output
+            feature map.
+        kernel_size(int|tuple|list, optional): The filter size. If kernel_size is a tuple,
+            it must contain one integers, (kernel_size). None if
+            use output size to calculate kernel_size. Default: None. kernel_size and
+            output_size should not be None at the same time.
+        stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution.
+            If stride is a tuple, it must contain one integer, (stride_size).
+            Default: stride = 1.
+        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
+             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
+             string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
+             If `padding` is a tuple or list, it could be in two forms:
+             `[pad]` or `[pad_left, pad_right]`. Default: padding = 0.
+        output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension.
+             If it is a tuple, it must contain one integer. Default: 0.
+        groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups = 1.
+        bias(bool, optional): Whether to use bias. Default: True.
+        dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points.
+            If dilation is a tuple, it must contain one integer, (dilation_size).
+            Default: dilation = 1.
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+            of conv1d_transpose. If it is set to None or one attribute of ParamAttr, conv1d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv1d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv1d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+
+    Attribute:
+        **weight** (Parameter): the learnable weights of filters of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+
+    Shape:
+        - x(Tensor): 3-D tensor with shape (batch, in_channels, length) when data_format is
+            "NCL" or shape (batch, length, in_channels) when data_format is "NLC".
+        - output_size(int|tuple|list, optional): The output image size. If output size is a
+            tuple, it must contain one integer, (feature_length). None if use
+            kernel_size, padding, output_padding and stride to calculate output_size.
+            If output_size and kernel_size are specified at the same time, They
+            should follow the formula above. Default: None. output_size and kernel_size
+            should not be None at the same time.
+        - output(Tensor): 3-D tensor with same shape as input x.
+
+    Examples:
+       .. code-block:: python
+
+          import paddle
+          from paddle.nn import ConvTranspose1d
+          import numpy as np
+          
+          paddle.disable_static()
+          # shape: (1, 2, 4)
+          x=np.array([[[4, 0, 9, 7],
+                       [8, 0, 9, 2]]]).astype(np.float32)
+          # shape: (2, 1, 2)
+          y=np.array([[[7, 0]],
+                      [[4, 2]]]).astype(np.float32)
+          x_t = paddle.to_tensor(x)
+          conv = ConvTranspose1d(2, 1, 2)
+          conv.weight.set_value(y)
+          y_t = conv(x_t)
+          y_np = y_t.numpy()
+          print y_np
+          
+          # [[[60. 16. 99. 75.  4.]]]
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 output_padding=0,
+                 groups=1,
+                 bias=True,
+                 dilation=1,
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCHW",
-                 dtype='float32'):
-        super(Conv2D, self).__init__()
-        assert param_attr is not False, "param_attr should not be False here."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
+                 data_format="NCL"):
+        super(ConvTranspose1d, self).__init__()
+        assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d."
+        self._param_attr = weight_attr
+        self._bias_attr = bias_attr
         self._groups = groups
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        self._act = act
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._output_padding = output_padding
         self._data_format = data_format
-        self._dtype = dtype
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-
-        self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-        self._stride = utils.convert_to_list(stride, 2, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
-        channel_last = (data_format == "NHWC")
-        self._padding = padding  # leave it to F.conv2d
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
+        self._bias = bias
 
-        num_filter_channels = num_channels // groups
-        filter_shape = [self._num_filters, num_filter_channels
-                        ] + self._filter_size
+        self._stride = utils.convert_to_list(stride, 1, 'stride')
+        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
+        self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
+        self._padding = padding
 
+        filter_shape = [self._in_channels, out_channels // groups
+                        ] + self._kernel_size
         self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer(
-                self._num_channels, filter_shape))
+            shape=filter_shape, attr=self._param_attr)
         self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+            attr=self._bias_attr, shape=[self._out_channels],
+            is_bias=True) if self._bias else None
 
-    def forward(self, input):
-        out = F.conv2d(
-            input,
+    def forward(self, x, output_size=None):
+        out = F.conv_transpose1d(
+            x,
             self.weight,
             bias=self.bias,
+            output_size=output_size,
+            output_padding=self._output_padding,
             padding=self._padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
             data_format=self._data_format)
         return out
 
 
-class Conv2DTranspose(layers.Layer):
+class ConvTranspose2d(_ConvNd):
     """
-	:alias_main: paddle.nn.Conv2DTranspose
-	:alias: paddle.nn.Conv2DTranspose,paddle.nn.layer.Conv2DTranspose,paddle.nn.layer.conv.Conv2DTranspose
-
-    This interface is used to construct a callable object of the ``Conv2DTranspose`` class.
+    This interface is used to construct a callable object of the ``ConvTranspose2d`` class.
     For more details, refer to code examples.
     The convolution2D transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input and output
@@ -256,55 +645,36 @@ class Conv2DTranspose(layers.Layer):
     is applied to the final result.
     The details of convolution transpose layer, please refer to the following explanation and references
     `conv2dtranspose <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_ .
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \sigma (W \\ast X + b)
-
     Where:
-
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
     Example:
-
         - Input:
-
           Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
           Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-
         - Output:
-
           Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
         Where
-
         .. math::
-
            H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
            W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
            H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
            W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
-
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of the filter. It is as same as the output
-            feature map.
-        filter_size(int or tuple): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        output_size(int or tuple, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of channels produced by the convolution.
+        kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple,
+            it must contain two integers, (kernel_size_H, kernel_size_W).
+            Otherwise, the kernel will be a square.
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides 
@@ -312,10 +682,10 @@ class Conv2DTranspose(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride(int or tuple, optional): The stride size. If stride is a tuple, it must
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
             contain two integers, (stride_H, stride_W). Otherwise, the
             stride_H = stride_W = stride. Default: 1.
-        dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation. Default: 1.
         groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
@@ -324,125 +694,94 @@ class Conv2DTranspose(layers.Layer):
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose.
+        bias_attr (ParamAttr|bool, optional): The attribute for the bias of conv2d_transpose.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d_transpose
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            Default: None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCHW" or "NHWC". Default: "NCHW".
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filters of this layer.
-
         **bias** (Parameter or None): the learnable bias of this layer.
-
-    Returns:
-        None
-
+    Shape:
+        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
+           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
     Examples:
        .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv2DTranspose(4, 6, (3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.ConvTranspose2d(4, 6, (3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 10, 10)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 output_size=None,
-                 padding=0,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride=1,
+                 padding=0,
+                 output_padding=0,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCHW",
-                 dtype='float32'):
-        super(Conv2DTranspose, self).__init__()
-        assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self._groups = groups
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._data_format = data_format
-        self._dtype = dtype
-
-        self._stride = utils.convert_to_list(stride, 2, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
-        self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
+                 data_format="NCHW"):
+        super(ConvTranspose2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            2,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x, output_size=None):
         if output_size is None:
-            self._output_size = output_size
-        elif isinstance(output_size, (list, tuple, int)):
-            self._output_size = utils.convert_to_list(output_size, 2,
-                                                      'output_size')
+            output_padding = self.output_padding
         else:
-            raise ValueError(
-                "output_size should be int, ot list[int] or tuple[int]")
-        self._padding = padding
+            output_padding = 0
 
-        filter_shape = [self._num_channels, num_filters // groups
-                        ] + self._filter_size
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        out = F.conv2d_transpose(
-            input,
+        out = F.conv_transpose2d(
+            x,
             self.weight,
             bias=self.bias,
-            output_size=self._output_size,
             padding=self._padding,
+            output_padding=output_padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
+            output_size=output_size,
             data_format=self._data_format)
         return out
 
 
-class Conv3D(layers.Layer):
+class Conv3d(_ConvNd):
     """
-	:alias_main: paddle.nn.Conv3D
-	:alias: paddle.nn.Conv3D,paddle.nn.layer.Conv3D,paddle.nn.layer.conv.Conv3D
-
-    **Convlution3D Layer**
-
-    The convolution3D layer calculates the output based on the input, filter
+    **Convlution3d Layer**
+    The convolution3d layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input(Input) and
     Output(Output) are multidimensional tensors with a shape of 
     :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of
@@ -451,49 +790,21 @@ class Conv3D(layers.Layer):
     but adds one dimension(depth). If bias attribution and activation type are
     provided, bias is added to the output of the convolution, and the
     corresponding activation function is applied to the final result.
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \sigma (W \\ast X + b)
-
     In the above equation:
-
     * :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
-
-        - Output:
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
-
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of filter. It is as same as the output image channel.
-        filter_size (int|tuple, optional): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square, filter_size_depth = filter_size_height
-            = filter_size_width = filter_size.
-        stride (int|tuple, optional): The stride size. If stride is a tuple, it must
+        in_channels(int): The number of input channels in the input image.
+        out_channels(int): The number of output channels produced by the convolution.
+        kernel_size (int|list|tuple, optional): The size of the convolving kernel.
+        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
             contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
             stride_D = stride_H = stride_W = stride. The default value is 1.
         padding (int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
@@ -503,7 +814,7 @@ class Conv3D(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
             dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
         groups (int, optional): The groups number of the Conv3d Layer. According to grouped
@@ -511,7 +822,8 @@ class Conv3D(layers.Layer):
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. The default value is 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
             will create ParamAttr as param_attr. If it is set to None, the parameter
             is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
@@ -521,120 +833,97 @@ class Conv3D(layers.Layer):
             If it is set to None or one attribute of ParamAttr, conv3d
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. The default value is None.
-        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. The default value is True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            The default value is None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCDHW" or "NDHWC". Default: "NCDHW".
-        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filters of this layer.
-
         **bias** (Parameter): the learnable bias of this layer.
-
-    Returns:
-        None.
-
+    Shape:
+        - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
                     groups mismatch.
-
     Examples:
         .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv3D(4, 6, (3, 3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          
+          paddle.disable_static()
+          x_var = dg.to_variable(x)
+          conv = nn.Conv3d(4, 6, (3, 3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 6, 6, 6)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  padding=0,
                  stride=1,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 padding_mode='zeros',
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCDHW",
-                 dtype='float32'):
-        super(Conv3D, self).__init__()
-        assert param_attr is not False, "param_attr should not be False here."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._groups = groups
-        self._act = act
-        self._use_cudnn = use_cudnn
-        self._dtype = dtype
-        self._data_format = data_format
-
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
-        channel_last = (data_format == "NDHWC")
-        self._padding = padding
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels // groups
-
-        filter_shape = [num_filters, num_filter_channels] + self._filter_size
-
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer(
-                self._num_channels, self._filter_size))
-
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+                 data_format="NCDHW"):
+        super(Conv3d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            3,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x):
+        if self._padding_mode != 'zeros':
+            x = F.pad(x,
+                      self._reversed_padding_repeated_twice,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+            return F.conv3d(
+                x,
+                self.weight,
+                bias=self.bias,
+                stride=self._stride,
+                dilation=self._dilation,
+                groups=self._groups,
+                data_format=self._data_format)
 
-    def forward(self, input):
         out = F.conv3d(
-            input,
+            x,
             self.weight,
             bias=self.bias,
             padding=self._padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
             data_format=self._data_format)
         return out
 
 
-class Conv3DTranspose(layers.Layer):
+class ConvTranspose3d(_ConvNd):
     """
-	:alias_main: paddle.nn.Conv3DTranspose
-	:alias: paddle.nn.Conv3DTranspose,paddle.nn.layer.Conv3DTranspose,paddle.nn.layer.conv.Conv3DTranspose
-
     **Convlution3D transpose layer**
-
     The convolution3D transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
     are in NCDHW format. Where N is batch size, C is the number of channels,
@@ -646,70 +935,38 @@ class Conv3DTranspose(layers.Layer):
     If bias attribution and activation type are provided, bias is added to
     the output of the convolution, and the corresponding activation function
     is applied to the final result.
-
     For each input :math:`X`, the equation is:
-
     .. math::
-
         Out = \sigma (W \\ast X + b)
-
     In the above equation:
-
     * :math:`X`: Input value, a tensor with NCDHW format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
     Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
-           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
-           D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\
-
     **Note**:
-
-          The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, 
+          The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, 
           when stride > 1, conv3d maps multiple input shape to the same output shape, 
-          so for conv3d_transpose, when stride > 1, input shape maps multiple output shape.
+          so for conv_transpose3d, when stride > 1, input shape maps multiple output shape.
           If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \
           H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output 
           size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, 
           the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` 
           and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must 
           between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, 
-          conv3d_transpose can compute the kernel size automatically.
-
-
+          conv_transpose3d can compute the kernel size automatically.
     Parameters:
-        num_channels(int): The number of channels in the input image.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        filter_size(int|tuple): The filter size. If filter_size is a tuple,
-            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        output_size(int or tuple, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
+        in_channels(int): The number of channels in the input image.
+        out_channels(int): The number of channels produced by the convolution.
+        kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple,
+            it must contain three integers, (kernel_size_D, kernel_size_H, kernel_size_W).
+            Otherwise, the kernel will be a square.
+        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
+            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
+            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
+            The default value is 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
             2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
@@ -717,11 +974,9 @@ class Conv3DTranspose(layers.Layer):
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
             The default value is 0.
-        stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
-            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
-            The default value is 1.
-        dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must
+        output_padding(int|list|tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0.
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
             contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
             dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
         groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
@@ -730,7 +985,7 @@ class Conv3DTranspose(layers.Layer):
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             The default value is 1.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
             of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. The default value is None.
@@ -739,109 +994,86 @@ class Conv3DTranspose(layers.Layer):
             If it is set to None or one attribute of ParamAttr, conv3d_transpose
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. The default value is None.
-        use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. The default value is True.
-        act (str, optional): Activation type, if it is set to None, activation is not appended.
-            The default value is None.
+        output_size(int|list|tuple, optional): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). None if use
+            filter_size, padding, and stride to calculate output_size.
+            if output_size and filter_size are specified at the same time, They
+            should follow the formula above. Default: None.
         data_format (str, optional): Data format that specifies the layout of input.
             It can be "NCDHW" or "NDHWC". Default: "NCDHW".
-
     Attribute:
         **weight** (Parameter): the learnable weights of filters of this layer.
-
         **bias** (Parameter): the learnable bias of this layer.
-
-    Returns:
-        None.
-
+    Shape:
+        - x: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel_size[0] - 1) + 1 \\\\
+           H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel_size[1] - 1) + 1 \\\\
+           W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel_size[2] - 1) + 1 \\\\
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
                     groups mismatch.
-
     Examples:
        .. code-block:: python
-
           import numpy as np
-          from paddle import fluid
-          import paddle.fluid.dygraph as dg
-          from paddle import nn
-
+          import paddle
+          import paddle.nn as nn
           x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32')
-          place = fluid.CPUPlace()
-          with dg.guard(place):
-              x_var = dg.to_variable(x)
-              conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
-              y_var = conv(x_var)
-              y_np = y_var.numpy()
-              print(y_np.shape)
+          
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
           
           # (2, 6, 10, 10, 10)
     """
 
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 output_size=None,
-                 padding=0,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride=1,
+                 padding=0,
+                 output_padding=0,
                  dilation=1,
                  groups=1,
-                 param_attr=None,
+                 weight_attr=None,
                  bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format="NCDHW",
-                 dtype='float32'):
-        super(Conv3DTranspose, self).__init__()
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
-        self._num_channels = num_channels
-        self._num_filters = num_filters
-        self._groups = groups
-        self._use_cudnn = use_cudnn
-        self._act = act
-        self._dtype = dtype
-        self._data_format = data_format
-
-        self._stride = utils.convert_to_list(stride, 3, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
-        self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
-        channel_last = (data_format == "NDHWC")
-        self._padding = padding
+                 data_format="NCDHW"):
+        super(ConvTranspose3d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            3,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x, output_size):
         if output_size is None:
-            self._output_size = output_size
-        elif isinstance(output_size, (list, tuple, int)):
-            self._output_size = utils.convert_to_list(output_size, 3,
-                                                      'output_size')
+            output_padding = self.output_padding
         else:
-            raise ValueError(
-                "output_size should be int, ot list[int] or tuple[int]")
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
+            output_padding = 0
 
-        filter_shape = [num_channels, num_filters // groups] + self._filter_size
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        out = F.conv3d_transpose(
-            input,
+        out = F.conv_transpose3d(
+            x,
             self.weight,
             bias=self.bias,
-            output_size=self._output_size,
             padding=self._padding,
+            output_padding=output_padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
-            use_cudnn=self._use_cudnn,
-            act=self._act,
+            output_size=output_size,
             data_format=self._data_format)
         return out
diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0917441de3fea640204a3891ed03e9a451e3f0f
--- /dev/null
+++ b/python/paddle/nn/layer/distance.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['PairwiseDistance']
+
+import numpy as np
+
+import paddle
+from ...fluid.dygraph import layers
+from ...fluid.framework import core, in_dygraph_mode
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid.layer_helper import LayerHelper
+
+
+class PairwiseDistance(layers.Layer):
+    """
+    This operator computes the pairwise distance between two vectors. The
+    distance is calculated by p-oreder norm:
+
+    .. math::
+
+        \Vert x \Vert _p = \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p}.
+
+    Parameters:
+        p (float): The order of norm. The default value is 2.
+        epsilon (float, optional): Add small value to avoid division by zero,
+            default value is 1e-6.
+        keepdim (bool, optional): Whether to reserve the reduced dimension
+            in the output Tensor. The result tensor is one dimension less than
+            the result of ``'x-y'`` unless :attr:`keepdim` is True, default
+            value is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        x: :math:`(N, D)` where `D` is the dimension of vector, available dtype
+            is float32, float64.
+        y: :math:`(N, D)`, y have the same shape and dtype as x.
+        out: :math:`(N)`. If :attr:`keepdim` is ``True``, the out shape is :math:`(N, 1)`.
+            The same dtype as input tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([[1., 3.], [3., 5.]]).astype(np.float64)
+            y_np = np.array([[5., 6.], [7., 8.]]).astype(np.float64)
+            x = paddle.to_variable(x_np)
+            y = paddle.to_variable(y_np)
+            dist = paddle.nn.PairwiseDistance()
+            distance = dist(x, y)
+            print(distance.numpy()) # [5. 5.]
+
+    """
+
+    def __init__(self, p=2., epsilon=1e-6, keepdim=False, name=None):
+        super(PairwiseDistance, self).__init__()
+        self.p = p
+        self.epsilon = epsilon
+        self.keepdim = keepdim
+        self.name = name
+        check_type(self.p, 'porder', (float, int), 'PairwiseDistance')
+        check_type(self.epsilon, 'epsilon', (float), 'PairwiseDistance')
+        check_type(self.keepdim, 'keepdim', (bool), 'PairwiseDistance')
+
+    def forward(self, x, y):
+        if in_dygraph_mode():
+            sub = core.ops.elementwise_sub(x, y)
+            return core.ops.p_norm(sub, 'axis', 1, 'porder', self.p, 'keepdim',
+                                   self.keepdim, 'epsilon', self.epsilon)
+
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
+                                 'PairwiseDistance')
+        check_variable_and_dtype(y, 'y', ['float32', 'float64'],
+                                 'PairwiseDistance')
+        sub = paddle.elementwise_sub(x, y)
+
+        helper = LayerHelper("PairwiseDistance", name=self.name)
+        attrs = {
+            'axis': 1,
+            'porder': self.p,
+            'keepdim': self.keepdim,
+            'epsilon': self.epsilon,
+        }
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        helper.append_op(
+            type='p_norm', inputs={'X': sub}, outputs={'Out': out}, attrs=attrs)
+
+        return out
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index d858d1352620399fdae477f6a6ca2db620abe2e2..de10e77eb1c000e66a7a914dc94ce39a6268bb61 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -12,20 +12,133 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define loss functions of neural network  
+# TODO: define loss functions of neural network
+import numpy as np
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
+from .. import functional as F
+from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator
 
 __all__ = [
-    #       'NCELoss',
+    'BCEWithLogitsLoss',
     'CrossEntropyLoss',
     'MSELoss',
     'L1Loss',
     'NLLLoss',
-    'BCELoss'
+    'BCELoss',
+    'KLDivLoss',
+    'MarginRankingLoss',
+    'CTCLoss',
+    'SmoothL1Loss',
 ]
 
 
+class BCEWithLogitsLoss(fluid.dygraph.Layer):
+    """
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    This can be thought of as predicting labels for a data-point, where labels
+    are not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time or none of these.
+
+    First this operator calculate loss function as follows:
+
+    .. math::
+           Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit))
+
+    We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get:
+
+    .. math::
+           Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit})
+
+    For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+
+    .. math::
+           Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|})
+
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
+    weight tensor on the loss `Out`. The ``weight`` tensor will attach different
+    weight on every items in the batch. The ``pos_weight`` will attach different
+    weight on the positive label of each class.
+
+    Finally, this operator applies reduce operation on the loss.
+    If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`.
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+
+    Args:
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
+            The data type is float32, float64. Default is ``'None'``.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'mean'``.
+        pos_weight (Tensor, optional): A weight of positive examples. Must be a vector
+            with length equal to the number of classes. The data type is float32, float64.
+            Default is ``'None'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shapes:
+        logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of Linear layer. Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``logit`` , else the shape of output is scalar.
+
+    Returns:
+        A callable object of BCEWithLogitsLoss.
+
+    Examples:
+
+        .. code-block:: python
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
+            output = bce_logit_loss(logit, label)
+            print(output.numpy())  # [0.45618808]
+
+    """
+
+    def __init__(self,
+                 weight=None,
+                 reduction='mean',
+                 pos_weight=None,
+                 name=None):
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in BCEWithLogitsLoss should be 'sum', 'mean' or 'none', but "
+                "received %s, which is not allowed." % reduction)
+
+        super(BCEWithLogitsLoss, self).__init__()
+        self.weight = weight
+        self.reduction = reduction
+        self.pos_weight = pos_weight
+        self.name = name
+
+    def forward(self, logit, label):
+        out = paddle.nn.functional.binary_cross_entropy_with_logits(
+            logit, label, self.weight, self.reduction, self.pos_weight,
+            self.name)
+        return out
+
+
 class CrossEntropyLoss(fluid.dygraph.Layer):
     """
 	:alias_main: paddle.nn.CrossEntropyLoss
@@ -55,8 +168,8 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
     Parameters:
         input (Variable): Input tensor, the data type is float32, float64. Shape is
 	    (N, C), where C is number of classes, and if shape is more than 2D, this
-	    is (N, C, D1, D2,..., Dk), k >= 1. 
-        label (Variable): Label tensor, the data type is int64. Shape is (N), where each 
+	    is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Variable): Label tensor, the data type is int64. Shape is (N), where each
 	    value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
 	    (N, D1, D2,..., Dk), k >= 1.
         weight (Variable, optional): Weight tensor, a manual rescaling weight given
@@ -112,7 +225,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
                 print(output.numpy())
     """
 
-    def __init__(self, weight=None, reduction='mean', ignore_index=-100):
+    def __init__(self, weight=None, ignore_index=-100, reduction='mean'):
         super(CrossEntropyLoss, self).__init__()
         self.weight = weight
         self.reduction = reduction
@@ -130,25 +243,16 @@ class CrossEntropyLoss(fluid.dygraph.Layer):
                 " 'none', but received %s, which is not allowed." %
                 self.reduction)
 
-        log_softmax = paddle.nn.LogSoftmax()
-        log_softmax_out = log_softmax(input)
-        if self.weight is not None and not isinstance(self.weight,
-                                                      fluid.framework.Variable):
-            raise ValueError(
-                "The weight' is not a Variable, please convert to Variable.")
-        nll_loss = paddle.nn.loss.NLLLoss(
+        return paddle.nn.functional.cross_entropy(
+            input,
+            label,
             weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
-
-        return nll_loss(log_softmax_out, label)
+            ignore_index=self.ignore_index,
+            reduction=self.reduction)
 
 
 class MSELoss(fluid.dygraph.layers.Layer):
     """
-	:alias_main: paddle.nn.MSELoss
-	:alias: paddle.nn.MSELoss,paddle.nn.layer.MSELoss,paddle.nn.layer.loss.MSELoss
-
     **Mean Square Error Loss**
     Computes the mean square error (squared L2 norm) of given input and label.
 
@@ -170,55 +274,34 @@ class MSELoss(fluid.dygraph.layers.Layer):
     where `input` and `label` are `float32` tensors of same shape.
 
     Parameters:
-        input (Variable): Input tensor, the data type is float32,
-        label (Variable): Label tensor, the data type is float32,
         reduction (string, optional): The reduction method for the output,
             could be 'none' | 'mean' | 'sum'.
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. 
-            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. 
-            If :attr:`reduction` is ``'none'``, the unreduced loss is returned. 
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
+            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
 
-    Returns:
-        The tensor variable storing the MSE loss of input and label.
-
-    Return type:
-        Variable.
+    Shape:
+        input (Tensor): Input tensor, the data type is float32 or float64
+        label (Tensor): Label tensor, the data type is float32 or float64
+        output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
 
     Examples:
         .. code-block:: python
 
             import numpy as np
             import paddle
-            from paddle import fluid
-            import paddle.fluid.dygraph as dg
 
-            mse_loss = paddle.nn.loss.MSELoss()
-            input = fluid.data(name="input", shape=[1])
-            label = fluid.data(name="label", shape=[1])
-            place = fluid.CPUPlace()
             input_data = np.array([1.5]).astype("float32")
             label_data = np.array([1.7]).astype("float32")
 
-            # declarative mode
-            output = mse_loss(input,label)
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            output_data = exe.run(
-                fluid.default_main_program(),
-                feed={"input":input_data, "label":label_data},
-                fetch_list=[output],
-                return_numpy=True)
-            print(output_data)
-            # [array([0.04000002], dtype=float32)]
-
-            # imperative mode
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                label = dg.to_variable(label_data)
-                output = mse_loss(input, label)
-                print(output.numpy())
-                # [0.04000002]
+            paddle.disable_static()
+            mse_loss = paddle.nn.loss.MSELoss()
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = mse_loss(input, label)
+            print(output.numpy())
+            # [0.04000002]
     """
 
     def __init__(self, reduction='mean'):
@@ -231,10 +314,10 @@ class MSELoss(fluid.dygraph.layers.Layer):
 
     def forward(self, input, label):
         if not fluid.framework.in_dygraph_mode():
-            fluid.data_feeder.check_variable_and_dtype(input, 'input',
-                                                       ['float32'], 'MSELoss')
-            fluid.data_feeder.check_variable_and_dtype(label, 'label',
-                                                       ['float32'], 'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(
+                input, 'input', ['float32', 'float64'], 'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(
+                label, 'label', ['float32', 'float64'], 'MSELoss')
 
         square_out = fluid.layers.square(
             fluid.layers.elementwise_sub(input, label))
@@ -250,180 +333,159 @@ class MSELoss(fluid.dygraph.layers.Layer):
 
 class L1Loss(fluid.dygraph.Layer):
     """
-	:alias_main: paddle.nn.L1Loss
-	:alias: paddle.nn.L1Loss,paddle.nn.layer.L1Loss,paddle.nn.layer.loss.L1Loss
-
     This interface is used to construct a callable object of the ``L1Loss`` class.
-    The L1Loss layer calculates the L1 Loss of input predictions and target 
-    labels as follows.
+    The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
+
+     If `reduction` set to ``'none'``, the loss is:
 
-    If :attr:`reduction` set to ``'none'``, the unreduced loss is:
     .. math::
-        Out = |input - label|
-    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+        Out = \lvert input - label\rvert
+
+    If `reduction` set to ``'mean'``, the loss is:
+
     .. math::
-        Out = MEAN(|input - label|)
-    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+        Out = MEAN(\lvert input - label\rvert)
+
+    If `reduction` set to ``'sum'``, the loss is:
+
     .. math::
-        Out = SUM(|input - label|)
+        Out = SUM(\lvert input - label\rvert)
+
 
-    The shape of input predictions and target labels are [N, *], where N is batch_size and `*` 
-    means any number of additional dimensions.
-    If :attr:`reduction` is ``'none'``, the shape of output loss is [N, *], the same as input.
-    If :attr:`reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1], which means the output is a scalar.
-    
     Parameters:
-        reduction (str, optional): Indicate the reduction to apply to the loss, 
+        reduction (str, optional): Indicate the reduction to apply to the loss,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'none'``, the unreduced loss is returned; 
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. 
-            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. 
+            If `reduction` is ``'none'``, the unreduced loss is returned;
+            If `reduction` is ``'mean'``, the reduced mean loss is returned.
+            If `reduction` is ``'sum'``, the reduced sum loss is returned.
             Default is ``'mean'``.
-    Returns:
-        A callable object of L1Loss.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
+        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
+        output (Tensor): The L1 Loss of ``input`` and ``label``.
+            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+            If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
+
     Examples:
         .. code-block:: python
-            # declarative mode
-            import paddle.fluid as fluid
-            import numpy as np
             import paddle
-            input = fluid.data(name="input", shape=[1])
-            label = fluid.data(name="label", shape=[1])
-            l1_loss = paddle.nn.loss.L1Loss(reduction='mean')
-            output = l1_loss(input,label)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-    
-            input_data = np.array([1.5]).astype("float32")
-            label_data = np.array([1.7]).astype("float32")
-            output_data = exe.run(fluid.default_main_program(),
-                    feed={"input":input_data, "label":label_data},
-                    fetch_list=[output],
-                    return_numpy=True)
-    
-            print(output_data)  # [array([0.2], dtype=float32)]
-            
-            # imperative mode
-            import paddle.fluid.dygraph as dg
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                label = dg.to_variable(label_data)
-                l1_loss = paddle.nn.loss.L1Loss(reduction='mean')
-                output = l1_loss(input,label)
-                print(output.numpy())  # [0.2]
+            import numpy as np
+
+            paddle.disable_static()
+            input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
+            label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+
+            l1_loss = paddle.nn.loss.L1Loss()
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [0.35]
+
+            l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [1.4]
+
+            l1_loss = paddle.nn.loss.L1Loss(reduction='none')
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [[0.20000005 0.19999999]
+            # [0.2        0.79999995]]
     """
 
-    def __init__(self, reduction='mean'):
+    def __init__(self, reduction='mean', name=None):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
                 "received %s, which is not allowed." % reduction)
         super(L1Loss, self).__init__()
         self.reduction = reduction
+        self.name = name
 
     def forward(self, input, label):
-        fluid.data_feeder.check_variable_and_dtype(
-            input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
-        fluid.data_feeder.check_variable_and_dtype(
-            label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
-
-        unreduced = fluid.layers.elementwise_sub(input, label, act='abs')
-
-        if self.reduction == 'sum':
-            return fluid.layers.reduce_sum(unreduced)
-        elif self.reduction == 'mean':
-            return fluid.layers.reduce_mean(unreduced)
-        else:
-            return unreduced
+        return paddle.nn.functional.l1_loss(
+            input, label, self.reduction, name=self.name)
 
 
 class BCELoss(fluid.dygraph.Layer):
     """
-	:alias_main: paddle.nn.BCELoss
-	:alias: paddle.nn.BCELoss,paddle.nn.layer.BCELoss,paddle.nn.layer.loss.BCELoss
-
     This interface is used to construct a callable object of the ``BCELoss`` class.
-    The BCELoss layer measures the binary_cross_entropy loss between input predictions 
-    and target labels. The binary_cross_entropy loss can be described as:
+    The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
+    and target labels ``label`` . The binary_cross_entropy loss can be described as:
 
     If :attr:`weight` is set, the loss is:
 
     .. math::
         Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
+
     If :attr:`weight` is None, the loss is:
 
     .. math::
         Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
 
-    If :attr:`reduction` set to ``'none'``, the unreduced loss is:
+    If :attr:`reduction` set to ``'none'``, the interface will return the original loss `Out`.
 
-    .. math::
-        Out = Out
     If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
 
     .. math::
         Out = MEAN(Out)
+
     If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
 
     .. math::
         Out = SUM(Out)
 
-    Note that the input predictions always be the output of sigmoid, and the target labels 
+    Note that the input predictions ``input`` always be the output of sigmoid, and the target labels ``label``
     should be numbers between 0 and 1.
 
-    The shape of input predictions and target labels are [N, *], where N is batch_size and `*` 
-    means any number of additional dimensions. If ``reduction`` is ``'none'``, the shape of 
-    output is scalar, else the shape of output is same as input.
-
     Parameters:
-        weight (Variable, optional): A manual rescaling weight given to the loss of each 
-            batch element. If given, has to be a Variable of size nbatch and the data type
+        weight (Tensor, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, has to be a Tensor of size nbatch and the data type
             is float32, float64. Default is ``'None'``.
-        reduction (str, optional): Indicate how to average the loss by batch_size, 
+        reduction (str, optional): Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; 
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
             If :attr:`reduction` is ``'sum'``, the summed loss is returned.
             Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): 2-D tensor with shape: (N, *), N is batch_size, `*` means
+            number of additional dimensions. The input ``input`` should always
+            be the output of sigmod.  Available dtype is float32, float64.
+        label (Tensor): 2-D tensor with the same shape as ``input``. The target
+            labels which values should be numbers between 0 and 1. Available
+            dtype is float32, float64.
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+            same as ``input`` , else the shape of output is scalar.
 
-    Returns: 
+    Returns:
         A callable object of BCELoss.
 
     Examples:
         .. code-block:: python
 
-            # declarative mode
-            import paddle.fluid as fluid
             import numpy as np
             import paddle
-            input = fluid.data(name="input", shape=[3, 1], dtype='float32')
-            label = fluid.data(name="label", shape=[3, 1], dtype='float32')
-            bce_loss = paddle.nn.loss.BCELoss()
-            output = bce_loss(input, label)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-    
             input_data = np.array([0.5, 0.6, 0.7]).astype("float32")
             label_data = np.array([1.0, 0.0, 1.0]).astype("float32")
-            output_data = exe.run(fluid.default_main_program(),
-                    feed={"input":input_data, "label":label_data},
-                    fetch_list=[output],
-                    return_numpy=True)
-    
-            print(output_data)  # [array([0.65537095], dtype=float32)]
-            
-            # imperative mode
-            import paddle.fluid.dygraph as dg
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_data)
-                label = dg.to_variable(label_data)
-                output = bce_loss(input, label)
-                print(output.numpy())  # [0.65537095]
+
+            paddle.disable_static()
+            input = paddle.to_variable(input_data)
+            label = paddle.to_variable(label_data)
+            bce_loss = paddle.nn.loss.BCELoss()
+            output = bce_loss(input, label)
+            print(output.numpy())  # [0.65537095]
+            paddle.enable_static()
+
     """
 
-    def __init__(self, weight=None, reduction='mean'):
+    def __init__(self, weight=None, reduction='mean', name=None):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but "
@@ -432,38 +494,12 @@ class BCELoss(fluid.dygraph.Layer):
         super(BCELoss, self).__init__()
         self.weight = weight
         self.reduction = reduction
+        self.name = name
 
     def forward(self, input, label):
-        dtype = self._helper.input_dtype(input)
-
-        fluid.data_feeder.check_variable_and_dtype(
-            input, 'input', ['float32', 'float64'], 'bce_loss')
-        fluid.data_feeder.check_variable_and_dtype(
-            label, 'label', ['float32', 'float64'], 'bce_loss')
-
-        out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
-        self._helper.append_op(
-            type='bce_loss',
-            inputs={
-                'X': [input],
-                'Label': [label],
-            },
-            outputs={'Out': [out]})
-
-        if self.weight is not None:
-            if isinstance(self.weight, fluid.framework.Variable):
-                w = self.weight
-                out = fluid.layers.elementwise_mul(out, w, axis=-1)
-            else:
-                raise ValueError(
-                    "The weight is not a Variable, please convert to Variable.")
-
-        if self.reduction == 'sum':
-            return fluid.layers.reduce_sum(out)
-        elif self.reduction == 'mean':
-            return fluid.layers.reduce_mean(out)
-        else:
-            return out
+        out = paddle.nn.functional.binary_cross_entropy(
+            input, label, self.weight, self.reduction, self.name)
+        return out
 
 
 class NLLLoss(fluid.dygraph.Layer):
@@ -471,20 +507,20 @@ class NLLLoss(fluid.dygraph.Layer):
 	:alias_main: paddle.nn.NLLLoss
 	:alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss
 
-    This op accepts input and target label and returns negative log likelihood 
+    This class accepts input and target label and returns negative log likelihood
     cross error. It is useful to train a classification problem with C classes.
-     
+
     The input for the loss is epected to contain log-probabilities of
-    each classes. It hs to be a Tensor of size either (batch_size, C) or 
+    each classes. It has to be a Tensor of size either (batch_size, C) or
     (batch_size, C, d1, d2, ..., dK) with K >= 1 for the K-dimensional case.
     The label for the loss should be a class index in the range [0, C-1]
     where C is the number of classes. If ignore_index is specified, the
     specified target value does not contribute to the input gradient.
-    
+
     If the optional argument `weight` is provided, it should be a 1D Tensor
     assigning weight to each of the classed. This is particularly useful
     when you have an unbalanced training set.
- 
+
     The loss is calculated as follows.
     The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
 
@@ -505,106 +541,379 @@ class NLLLoss(fluid.dygraph.Layer):
         \\end{cases}
 
     Parameters:
-        input (Variable): Input tensor, the data type is float32, float64. 
-        label (Variable): Label tensor, the data type is int64_t.
-        weight (Variable, optional): Weight tensor, a manual rescaling weight given
-            to each class. If given, it has to be a Tensor of size `C`. Otherwise,
-            it treated as if having all ones. the data type is 
+        weight (Tensor, optional): Weight tensor, a manual rescaling weight given
+            to each class. If given, it has to be a 1D Tensor whose size is `[C, ]`. Otherwise,
+            it treated as if having all ones. the data type is
             float32, float64, Default is ``'None'``.
-        reduction (str, optional): Indicate how to average the loss, 
-            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; 
-            Default is ``'mean'``.
         ignore_index (int64, optional): Specifies a target value that is ignored
             and does not contribute to the input gradient.
+        reduction (str, optional): Indicate how to average the loss,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If `reduction` is ``'mean'``, the reduced mean loss is returned;
+            if `reduction` is ``'sum'``, the reduced sum loss is returned;
+            if `reduction` is ``'none'``, no reduction will be apllied.
+            Default is ``'mean'``.
+         name (str, optional): Name for the operation (optional, default is None).
+             For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
+            But in K-dimension situation, the shape is :math:`[N, C, d_1, d_2, ..., d_K]`.
+            The data type is float32, float64.
+        label (Tensor): Label tensor, the shape is :math:`[N,]` or :math:`[N, d_1, d_2, ..., d_K]`.
+            The data type is int64.
+        output (Tensor): the `negative log likelihood loss` between input `x` and `label`.
+            If `reduction` is `'none'`, the shape is `[N, *]`.
+            If `reduction` is `'sum'` or `'mean'`, the shape is `[1]`.
+
+    Examples:
+        .. code-block:: python
+
+                import paddle
+                import numpy as np
+
+                nll_loss = paddle.nn.layer.NLLLoss()
+                log_softmax = paddle.nn.LogSoftmax(axis=1)
+
+                input_np = np.array([[0.88103855, 0.9908683 , 0.6226845 ],
+                                 [0.53331435, 0.07999352, 0.8549948 ],
+                                 [0.25879037, 0.39530203, 0.698465  ],
+                                 [0.73427284, 0.63575995, 0.18827209],
+                                 [0.05689114, 0.0862954 , 0.6325046 ]]).astype(np.float32)
+                label_np = np.array([0, 2, 1, 1, 0]).astype(np.int64)
+
+                place = paddle.CPUPlace()
+                paddle.disable_static(place)
+                input = paddle.to_variable(input_np)
+                log_out = log_softmax(input)
+                label = paddle.to_variable(label_np)
+                result = nll_loss(log_out, label)
+                print(result.numpy()) # [1.0720209]
+
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=-100,
+                 reduction='mean',
+                 name=None):
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
+                "'none', but received %s, which is not allowed." % reduction)
+        super(NLLLoss, self).__init__()
+        self._weight = weight
+        self._ignore_index = ignore_index
+        self._reduction = reduction
+        self._name = name
+
+    def forward(self, input, label):
+        return F.nll_loss(
+            input,
+            label,
+            weight=self._weight,
+            ignore_index=self._ignore_index,
+            reduction=self._reduction,
+            name=self._name)
+
+
+class KLDivLoss(fluid.dygraph.Layer):
+    """
+    This interface calculates the Kullback-Leibler divergence loss
+    between Input(X) and Input(Target). Notes that Input(X) is the
+    log-probability and Input(Target) is the probability.
+
+    KL divergence loss is calculated as follows:
+
+    $$l(x, y) = y * (\log(y) - x)$$
+
+    Parameters:
+        reduction (str, optional): Indicate how to average the loss,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            Default is ``'mean'``.
+
+    Shape:
+      - input: (N, *) where * means, any number of additional dimensions.
+      - label: (N, *), same shape as input
+      - output: tensor with shape: (1) by default.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            import paddle.nn as nn
+
+            paddle.enable_imperative()
+
+            shape = (5, 20)
+            x = np.random.uniform(-10, 10, shape).astype('float32')
+            target = np.random.uniform(-10, 10, shape).astype('float32')
+
+            # 'batchmean' reduction, loss shape will be [N]
+            kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[5]
+
+            # 'mean' reduction, loss shape will be [1]
+            kldiv_criterion = nn.KLDivLoss(reduction='mean')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[1]
+
+            # 'sum' reduction, loss shape will be [1]
+            kldiv_criterion = nn.KLDivLoss(reduction='sum')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[1]
+
+            # 'none' reduction, loss shape is same with X shape
+            kldiv_criterion = nn.KLDivLoss(reduction='none')
+            pred_loss = kldiv_criterion(paddle.to_variable(x),
+                                        paddle.to_variable(target))
+            # shape=[5, 20]
+    """
+
+    def __init__(self, reduction='mean'):
+        super(KLDivLoss, self).__init__()
+        self.reduction = reduction
+
+    def forward(self, input, label):
+        out = paddle.nn.functional.kl_div(input, label, self.reduction)
+        return out
+
+
+class MarginRankingLoss(fluid.dygraph.Layer):
+    """
+
+    This interface is used to construct a callable object of the ``MarginRankingLoss`` class.
+    The MarginRankingLoss layer calculates the margin rank loss between the input, other and label
+    , use the math function as follows.
+
+    .. math::
+        margin\_rank\_loss = max(0, -label * (input - other) + margin)
+
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
+    .. math::
+        Out = MEAN(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
+    .. math::
+        Out = SUM(margin\_rank\_loss)
+
+    If :attr:`reduction` set to ``'none'``, just return the origin ``margin_rank_loss``.
+
+    Parameters:
+        margin (float, optional): The margin value to add, default value is 0;
+        reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input: N-D Tensor, the shape is [N, *], N is batch size and `*` means any number of additional dimensions., available dtype is float32, float64.
+        other: N-D Tensor, `other` have the same shape and dtype as `input`.
+        label: N-D Tensor, label have the same shape and dtype as `input`.
+        output: If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
 
     Returns:
-        The tensor variable storing the nll_loss.
+        A callable object of MarginRankingLoss.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype("float32"))
+            other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype("float32"))
+            label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype("float32"))
+            margin_rank_loss = paddle.nn.MarginRankingLoss()
+            loss = margin_rank_loss(input, other, label)
+            print(loss.numpy()) # [0.75]
+    """
+
+    def __init__(self, margin=0.0, reduction='mean', name=None):
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
+                "received %s, which is not allowed." % reduction)
+        super(MarginRankingLoss, self).__init__()
+        self.margin = margin
+        self.reduction = reduction
+        self.name = name
+
+    def forward(self, input, other, label):
+        out = paddle.nn.functional.margin_ranking_loss(
+            input, other, label, self.margin, self.reduction, self.name)
+        return out
+
+
+class CTCLoss(fluid.dygraph.Layer):
+    """
+	:alias_main: paddle.nn.CTCLoss
+	:alias: paddle.nn.CTCLoss, paddle.nn.layer.CTCLoss, paddle.nn.layer.loss.CTCLoss
+
+    An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
+    to compute Connectionist Temporal Classification (CTC) loss.
+    It can be aliased as softmax with CTC, since a native softmax activation
+    is interated to the Warp-CTC library to normalize values for each row of the input tensor.
+
+    Parameters:
+        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
+        reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
+
+    Shape:
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
+        input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
+        label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
+
+    Returns:
+        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and  ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
 
-    Return type: Variable.
-    
     Examples:
 
         .. code-block:: python
 
             # declarative mode
-            import paddle.fluid as fluid
             import numpy as np
             import paddle
 
-            input_np = np.random.random(size=(10, 10)).astype(np.float32)
-            label_np = np.random.randint(0, 10, size=(10,)).astype(np.int64)
-            prog = fluid.Program()
-            startup_prog = fluid.Program()
-            place = fluid.CPUPlace()
-            with fluid.program_guard(prog, startup_prog):
-                input = fluid.data(name='input', shape=[10, 10], dtype='float32')
-                label = fluid.data(name='label', shape=[10], dtype='int64')
-                nll_loss = paddle.nn.loss.NLLLoss()
-                res = nll_loss(input, label)
-
-                exe = fluid.Executor(place)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "label": label_np},
-                    fetch_list=[res])
-            print(static_result)
-            
-            # imperative mode
-            import paddle.fluid.dygraph as dg
-            with dg.guard(place) as g:
-                input = dg.to_variable(input_np)
-                label = dg.to_variable(label_np)
-                output = nll_loss(input, label)
-                print(output.numpy())
+            # length of the longest logit sequence
+            max_seq_length = 4
+            #length of the longest label sequence
+            max_label_length = 3
+            # number of logit sequences
+            batch_size = 2
+            # class num
+            class_num = 3
+
+            np.random.seed(1)
+            log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
+                                    [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
+
+                                    [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
+                                    [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
+
+                                    [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
+                                    [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
+
+                                    [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
+                                    [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
+
+                                    [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
+                                    [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32")
+            labels = np.array([[1, 2, 2],
+                            [1, 2, 2]]).astype("int32")
+            input_lengths = np.array([5, 5]).astype("int64")
+            label_lengths = np.array([3, 3]).astype("int64")
+
+            paddle.disable_static()
+            log_probs = paddle.to_tensor(log_probs)
+            labels = paddle.to_tensor(labels)
+            input_lengths = paddle.to_tensor(input_lengths)
+            label_lengths = paddle.to_tensor(label_lengths)
+
+            loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels,
+                input_lengths,
+                label_lengths)
+            print(loss.numpy())  #[3.9179852 2.9076521]
+
+            loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels,
+                input_lengths,
+                label_lengths)
+            print(loss.numpy())  #[1.1376063]
     """
 
-    def __init__(self, weight=None, reduction='mean', ignore_index=-100):
-        super(NLLLoss, self).__init__()
-        self.weight = weight
+    def __init__(self, blank=0, reduction='mean'):
+        super(CTCLoss, self).__init__()
+        self.blank = blank
         self.reduction = reduction
-        self.ignore_index = ignore_index
 
-    def forward(self, input, label):
-        dtype = self._helper.input_dtype(input)
+    def forward(self, log_probs, labels, input_lengths, label_lengths):
+        return paddle.nn.functional.ctc_loss(log_probs, labels, input_lengths,
+                                             label_lengths, self.blank,
+                                             self.reduction)
 
-        fluid.data_feeder.check_variable_and_dtype(
-            input, 'input', ['float32', 'float64'], 'nll_loss')
-        fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
-                                                   'nll_loss')
 
-        if self.reduction not in ['sum', 'mean', 'none']:
-            raise ValueError(
-                "The value of 'reduction' in nll_loss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % self.reduction)
-
-        x_shape = list(input.shape)
-        n = x_shape[0]
-        c = x_shape[1]
-        x_dims = len(x_shape)
-        if x_dims < 2:
-            raise ValueError('Expected 2 or more dimensions (got {})'.format(
-                x_dims))
-        if x_dims != 2 and x_dims != 4:
-            input = fluid.layers.reshape(input, shape=[n, c, 1, -1])
-            label = fluid.layers.reshape(label, shape=[n, 1, -1])
-            out_shape = [n] + x_shape[2:]
-
-        inputs = {'X': input, 'Label': label}
-        attrs = {'reduction': self.reduction, 'ignore_index': self.ignore_index}
-        if self.weight is not None:
-            if isinstance(self.weight, fluid.framework.Variable):
-                inputs['Weight'] = self.weight
-
-        out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
-        total_weight = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        outputs = {'Out': out, 'Total_weight': total_weight}
-
-        self._helper.append_op(
-            type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
-        if x_dims != 2 and x_dims != 4 and self.reduction == 'none':
-            out = fluid.layers.reshape(out, shape=out_shape)
+class SmoothL1Loss(fluid.dygraph.Layer):
+    """
+    This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
+    term if the absolute element-wise error falls below 1 and an L1 term otherwise.
+    In some cases it can prevent exploding gradients and it is more robust and less
+    sensitivity to outliers. Also known as the Huber loss:
 
-        return out
+    .. math::
+
+         loss(x,y)=\\frac{1}{n}\\sum_{i}z_i
+
+    where z_i is given by:
+
+    .. math::
+
+         \\mathop{z_i}=\\left\\{\\begin{array}{rcl}
+        0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\
+        delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise}
+        \\end{array} \\right.
+
+    Parameters:
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        delta (float, optional): Specifies the hyperparameter delta to be used.
+            The value determines how large the errors need to be to use L1. Errors
+            smaller than delta are minimized with L2. Parameter is ignored for
+            negative/zero values. Default = 1.0
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Call Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is
+            (N, C), where C is number of classes, and if shape is more than 2D, this
+            is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is float32 or float64. The shape of label
+            is the same as the shape of input.
+
+    Returns:
+        The tensor variable storing the smooth_l1_loss of input and label.
+
+    Return type: Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(3,3).astype("float32")
+            label_data = np.random.rand(3,3).astype("float32")
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            loss = paddle.nn.SmoothL1Loss()
+            output = loss(input, label)
+            print(output.numpy())
+    """
+
+    def __init__(self, reduction='mean', delta=1.0, name=None):
+        super(SmoothL1Loss, self).__init__()
+        self.reduction = reduction
+        self.delta = delta
+        self.name = name
+
+    def forward(self, input, label):
+        return F.smooth_l1_loss(
+            input,
+            label,
+            reduction=self.reduction,
+            delta=self.delta,
+            name=self.name)
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 1beba62c1809ffd94a22712fb24ac43a0ec23ff1..c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -1,4 +1,17 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,10 +30,1085 @@
 from ...fluid.dygraph.nn import InstanceNorm
 
 from ...fluid.dygraph import BatchNorm  #DEFINE_ALIAS
-from ...fluid.dygraph import GroupNorm  #DEFINE_ALIAS
-from ...fluid.dygraph import LayerNorm  #DEFINE_ALIAS
+#from ...fluid.dygraph import GroupNorm  #DEFINE_ALIAS
+
+#from ...fluid.dygraph import LayerNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import SpectralNorm  #DEFINE_ALIAS
 
+from ...fluid.dygraph import layers
+
+from ...framework import get_default_dtype, set_default_dtype
+from ...fluid.framework import in_dygraph_mode
+
+from ...fluid.initializer import Constant
+from ...fluid.param_attr import ParamAttr
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid import core, dygraph_utils
+
+from ..functional import batch_norm, layer_norm, instance_norm
+
+import numpy as np
+import numbers
+import warnings
+
 __all__ = [
-    'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm'
+    'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
+    'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d',
+    'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm'
 ]
+
+
+class _InstanceNormBase(layers.Layer):
+    """
+    This class is based class for InstanceNorm1d, 2d, 3d. 
+
+    See InstaceNorm1d, InstanceNorm2d or InstanceNorm3d for more details.
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-5,
+                 momentum=0.9,
+                 weight_attr=None,
+                 bias_attr=None,
+                 track_running_stats=False,
+                 data_format="NCHW",
+                 name=None):
+        super(_InstanceNormBase, self).__init__()
+
+        if weight_attr == False or bias_attr == False:
+            assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm"
+        self._epsilon = epsilon
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        if weight_attr != False and bias_attr != False:
+            self.scale = self.create_parameter(
+                attr=self._weight_attr,
+                shape=[num_features],
+                default_initializer=Constant(1.0),
+                is_bias=False)
+            self.bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=[num_features],
+                default_initializer=Constant(0.0),
+                is_bias=True)
+        else:
+            self.scale = None
+            self.bias = None
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError("InstanceNorm Base error")
+
+    def forward(self, input):
+        self._check_input_dim(input)
+
+        return instance_norm(
+            input, weight=self.scale, bias=self.bias, eps=self._epsilon)
+
+
+class InstanceNorm1d(_InstanceNormBase):
+    """
+    Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
+
+    DataLayout: NCL `[batch, in_channels, length]`
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+        
+        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
+        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Note:
+        `H` means height of feature map, `W` means width of feature map.
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to use global mean and
+            variance. In train mode, when setting track_running_stats True, the global mean
+            and variance are also used during train period. Default: False.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
+	     If the Initializer of the weight_attr is not set, the parameter is initialized 
+	     one. If it is set to False, will not create weight_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
+             If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
+             If it is set to False, will not create bias_attr. Default: None.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL".
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+
+    Shape:
+        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
+        - output: 3-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Momentum and track_running_stats is not effective. The next version will fix the problem .
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm = paddle.nn.InstanceNorm1d(2)
+          instance_norm_out = instance_norm(x)
+
+          print(instance_norm_out.numpy)
+
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 2 and len(input.shape) != 3:
+            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class InstanceNorm2d(_InstanceNormBase):
+    """
+    Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
+
+    DataLayout: NCHW `[batch, in_channels, in_height, in_width]`
+
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+        
+        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
+        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Note:
+        `H` means height of feature map, `W` means width of feature map.
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to use global mean and
+            variance. In train mode, when setting track_running_stats True, the global mean
+            and variance are also used during train period. Default: False.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
+	     If the Initializer of the weight_attr is not set, the parameter is initialized 
+	     one. If it is set to False, will not create weight_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
+             If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
+             If it is set to False, will not create bias_attr. Default: None.
+        data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW.
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, weight).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Momentum and track_running_stats is not effective. The next version will fix the problem .
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm = paddle.nn.InstanceNorm2d(2)
+          instance_norm_out = instance_norm(x)
+
+          print(instance_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 4:
+            raise ValueError('expected 4D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class InstanceNorm3d(_InstanceNormBase):
+    """
+    Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
+
+    DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]`
+
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+        
+        \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
+        \\ mean\ of\ one\  feature\ map\ in\ mini-batch \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Note:
+        `H` means height of feature map, `W` means width of feature map.
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to use global mean and
+            variance. In train mode, when setting track_running_stats True, the global mean
+            and variance are also used during train period. Default: False.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
+	     If the Initializer of the weight_attr is not set, the parameter is initialized 
+	     one. If it is set to False, will not create weight_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
+             If it is set to None or one attribute of ParamAttr, instance_norm
+	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
+             If it is set to False, will not create bias_attr. Default: None.
+        data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW.
+        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 5-D tensor with shape: (batch, num_features, dims, height, weight).
+        - output: 5-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Momentum and track_running_stats is not effective. The next version will fix the problem .
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          instance_norm = paddle.nn.InstanceNorm3d(2)
+          instance_norm_out = instance_norm(x)
+
+          print(instance_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 5:
+            raise ValueError('expected 5D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class GroupNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``GroupNorm`` class.
+    For more details, refer to code examples.
+    It implements the function of the Group Normalization Layer.
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
+
+    Parameters:
+        num_channels(int): The number of channels of input.
+        num_groups(int): The number of groups that divided from channels.
+        epsilon(float, optional): The small value added to the variance to prevent
+                                  division by zero. Default: 1e-05.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+                                         scale :math:`g`. If it is set to False, no scale will be added to the output units.
+                                         If it is set to None, the bias is initialized one. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+                                        bias :math:`b`. If it is set to False, no bias will be added to the output units.
+                                        If it is set to None, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.
+        name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, weight).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6)
+          group_norm_out = group_norm(x)
+
+          print(group_norm_out.numpy)
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_groups,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_layout='NCHW',
+                 name=None):
+        super(GroupNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._epsilon = epsilon
+        self._num_channels = num_channels
+        self._num_groups = num_groups
+        if data_layout != 'NCHW':
+            raise ValueError("unsupported data layout:" + data_layout)
+
+        param_shape = [self._num_channels]
+
+        self.weight = self.create_parameter(
+            attr=self._weight_attr or False,
+            shape=param_shape,
+            default_initializer=Constant(1.0))
+
+        self.bias = self.create_parameter(
+            attr=self._weight_attr or False, shape=param_shape, is_bias=True)
+
+    def forward(self, input):
+        inputs = {'X': input}
+        if self.bias is not None:
+            inputs['Bias'] = self.bias
+        if self.weight is not None:
+            inputs['Scale'] = self.weight
+
+        # create output
+        mean_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype, stop_gradient=True)
+        variance_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype, stop_gradient=True)
+        group_norm_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype)
+
+        self._helper.append_op(
+            type="group_norm",
+            inputs=inputs,
+            outputs={
+                "Y": group_norm_out,
+                "Mean": mean_out,
+                "Variance": variance_out,
+            },
+            attrs={"epsilon": self._epsilon,
+                   "groups": self._num_groups})
+
+        return self._helper.append_activation(group_norm_out, None)
+
+
+class LayerNorm(layers.Layer):
+    """
+    :alias_main: paddle.nn.LayerNorm
+	:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
+	:old_api: paddle.fluid.dygraph.LayerNorm
+
+    This interface is used to construct a callable object of the ``LayerNorm`` class.
+    For more details, refer to code examples.
+    It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
+    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+
+    The formula is as follows:
+
+    ..  math::
+
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i
+
+        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}
+
+        y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)
+
+    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
+    - :math:`H`: the number of hidden units in a layers
+    - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
+    - :math:`g`: the trainable scale parameter.
+    - :math:`b`: the trainable bias parameter.
+
+    Parameters:
+        normalized_shape(int|list|tuple): Input shape from an expected input of
+            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
+            If it is a single integer, this module will normalize over the last dimension
+            which is expected to be of that specific size.
+        epsilon(float, optional): The small value added to the variance to prevent
+            division by zero. Default: 1e-05.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The
+            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
+        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 2-D, 3-D, 4-D or 5-D tensor.
+        - output: same shape as input x.
+
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          layer_norm = paddle.nn.LayerNorm(x_data.shape[1:])
+          layer_norm_out = layer_norm(x)
+
+          print(layer_norm_out.numpy)
+    """
+
+    def __init__(self,
+                 normalized_shape,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(LayerNorm, self).__init__()
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = [normalized_shape]
+
+        self._normalized_shape = list(normalized_shape)
+        self._epsilon = epsilon
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        param_shape = [np.prod(self._normalized_shape)]
+
+        if weight_attr is False:
+            self.weight = None
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+
+        if bias_attr is False:
+            self.bias = None
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+
+    def forward(self, input):
+        return layer_norm(
+            input,
+            normalized_shape=self._normalized_shape,
+            weight=self.weight,
+            bias=self.bias,
+            epsilon=self._epsilon)
+
+
+class _BatchNormBase(layers.Layer):
+    """
+    BatchNorm base .
+    """
+
+    def __init__(self,
+                 num_features,
+                 momentum=0.9,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 track_running_stats=True,
+                 name=None):
+        super(_BatchNormBase, self).__init__()
+        self._num_features = num_features
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        if get_default_dtype() == 'float16':
+            set_default_dtype('float32')
+
+        param_shape = [num_features]
+
+        # create parameter
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=param_shape,
+            default_initializer=Constant(1.0))
+        self.weight.stop_gradient = (self._weight_attr is False) or (
+            self._weight_attr and self._weight_attr.learning_rate == 0.)
+
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=param_shape, is_bias=True)
+        self.bias.stop_gradient = (self._bias_attr is False) or (
+            self._bias_attr and self._bias_attr.learning_rate == 0.)
+
+        moving_mean_name = None
+        moving_variance_name = None
+
+        if name is not None:
+            moving_mean_name = name + "_mean"
+            moving_variance_name = name + "_variance"
+
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_mean_name,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_variance_name,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+        self._data_format = data_format
+        self._in_place = False
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._fuse_with_relu = False
+        self._track_running_stats = track_running_stats
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError("BatchNorm Base error")
+
+    def forward(self, input):
+
+        self._check_input_dim(input)
+
+        if not self.training and not self._track_running_stats:
+            raise ValueError(
+                'When inference, expected track_running_stats is True.')
+
+        if self.training and not self._track_running_stats:
+            warnings.warn(
+                "When training, we now always track global mean and variance.")
+
+        return batch_norm(
+            input,
+            self._mean,
+            self._variance,
+            weight=self.weight,
+            bias=self.bias,
+            training=self.training,
+            momentum=self._momentum,
+            epsilon=self._epsilon,
+            data_format=self._data_format)
+
+
+class BatchNorm1d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance). It usually got from the
+    pre-trained model. Calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The normalization function formula is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable proportional parameter
+    - :math:`\\beta` : trainable deviation parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL".
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
+            True will track global mean and variance used for inference. When inference, track_running_stats must be 
+            True. Default: True.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
+        - output: 3-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Now track_running_stats is actucal always true. The next version will fix the problem .
+    
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          batch_norm = paddle.nn.BatchNorm1d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 2 and len(input.shape) != 3:
+            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class BatchNorm2d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance). It usually got from the
+    pre-trained model. Calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The normalization function formula is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable proportional parameter
+    - :math:`\\beta` : trainable deviation parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
+            True will track global mean and variance used for inference. When inference, track_running_stats must be 
+            True. Default: True.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, weight).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    **Note**:
+        Now track_running_stats is actucal always true. The next version will fix the problem .
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          batch_norm = paddle.nn.BatchNorm2d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 4:
+            raise ValueError('expected 4D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class BatchNorm3d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance). It usually got from the
+    pre-trained model. Calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The normalization function formula is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable proportional parameter
+    - :math:`\\beta` : trainable deviation parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: NCDHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
+            True will track global mean and variance used for inference. When inference, track_running_stats must be 
+            True. Default: True.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
+
+    Shape:
+        - x: 5-D tensor with shape: (batch, num_features, dims, height, weight).
+        - output: 5-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    **Note**:
+        Now track_running_stats is actucal always true. The next version will fix the problem .
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data) 
+          batch_norm = paddle.nn.BatchNorm3d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy)
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 5:
+            raise ValueError('expected 5D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class SyncBatchNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
+    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can 
+    be used as a normalizer function for other operations, such as conv2d and fully connected 
+    operations.
+    The data is normalized by the mean and variance of the channel based on whole mini-batch
+    , which including data in all gpus.
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+
+    When model in training mode, the :math:`\\mu_{\\beta}` 
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    - :math:`x` : whole mini-batch data in all gpus
+    - :math:`m` : the size of the whole mini-batch data
+
+    When model in evaluation mode, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, 
+    which usually got from the pre-trained model). Global statistics calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The formula of normalization is as follows:
+ 
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\eps` : add a smaller value to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter vector
+    - :math:`\\beta` : trainable shift parameter vector 
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of this layer. If it is set to None or one attribute of ParamAttr, this layerr
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. If it is set to False, 
+             this layer will not have trainable scale parameter. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
+             If it is set to None or one attribute of ParamAttr, this layer
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. If it is set to False, this layer will not 
+             have trainable bias parameter. Default: None.
+        track_running_stats(bool, optional): Whether to compute global stats, which including running mean and 
+             running variance. Default: True.
+
+    Shapes:
+        input: Tensor that the dimension from 2 to 5.
+        output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(x)
+          if paddle.fluid.is_compiled_with_cuda():
+              sync_batch_norm = nn.SyncBatchNorm(2)
+              hidden1 = sync_batch_norm(x)
+              print(hidden1.numpy())
+              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-05,
+                 momentum=0.9,
+                 track_running_stats=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 name=None):
+        super(SyncBatchNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._num_features = num_features
+        self._data_layout = data_format
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._track_running_stats = track_running_stats
+
+        if self._track_running_stats == False:
+            warnings.warn(
+                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
+            )
+
+        param_shape = [self._num_features]
+
+        # create parameter
+        if weight_attr == False:
+            self.weight = self.create_parameter(
+                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+            self.weight.stop_gradient = True
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
+
+        if bias_attr == False:
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+            self.bias.stop_gradient = True
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
+
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+    def forward(self, x):
+        # create output
+        # mean and mean_out share the same memory
+        mean_out = self._mean
+        # variance and variance out share the same memory
+        variance_out = self._variance
+
+        ### train mode: use mini-batch stats, eval mode: use global stats
+        ### use_global_stats only support False in sync_batch_norm
+        if in_dygraph_mode():
+            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
+                     "is_test", not self.training, "data_layout",
+                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
+                     False, "use_global_stats", False, 'trainable_statistics',
+                     False)
+            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
+                x, self.weight, self.bias, self._mean, self._variance, mean_out,
+                variance_out, *attrs)
+
+            return sync_batch_norm_out
+
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'BatchNorm')
+
+        attrs = {
+            "momentum": self._momentum,
+            "epsilon": self._epsilon,
+            "is_test": not self.training,
+            "data_layout": self._data_layout,
+            "use_mkldnn": False,
+            "fuse_with_relu": False,
+            "use_global_stats": False,
+            "trainable_statistics": False,
+        }
+
+        inputs = {
+            "X": [x],
+            "Scale": [self.weight],
+            "Bias": [self.bias],
+            "Mean": [self._mean],
+            "Variance": [self._variance]
+        }
+
+        saved_mean = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        saved_variance = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        outputs = {
+            "Y": [sync_batch_norm_out],
+            "MeanOut": [mean_out],
+            "VarianceOut": [variance_out],
+            "SavedMean": [saved_mean],
+            "SavedVariance": [saved_variance]
+        }
+
+        self._helper.append_op(
+            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        return sync_batch_norm_out
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
new file mode 100755
index 0000000000000000000000000000000000000000..87fa0caec9ee287c42d8308d9da25c6d2fc9b911
--- /dev/null
+++ b/python/paddle/nn/layer/pooling.py
@@ -0,0 +1,877 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+from ...fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from ...fluid.layers import utils
+from ...fluid.dygraph import layers
+from ...fluid.layer_helper import LayerHelper
+from .. import functional as F
+
+__all__ = [
+    'AdaptiveAvgPool2d',
+    'AdaptiveAvgPool3d',
+    'AvgPool1d',
+    'maxPool1d',
+    'AdaptiveMaxPool1d',
+    'AdaptiveAvgPool1d',
+    'AvgPool2d',
+    'MaxPool2d',
+    'AvgPool3d',
+    'MaxPool3d',
+]
+
+
+class AdaptiveAvgPool2d(layers.Layer):
+    """
+
+    This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two element, (H, W). H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
+            the order of: [batch_size, input_channels, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32 or float64.
+        output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool2d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool2d
+            # suppose input data in shape of [N, C, H, W], `output_size` is [m, n],
+            # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+            # of input data into m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(m):
+            #         for j in range(n):
+            #             hstart = floor(i * H / m)
+            #             hend = ceil((i + 1) * H / m)
+            #             wstart = floor(i * W / n)
+            #             wend = ceil((i + 1) * W / n)
+            #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+            #
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3)
+            pool_out = adaptive_avg_pool(x = x)
+            # pool_out.shape is [2, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCHW", name=None):
+        super(AdaptiveAvgPool2d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool2d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AdaptiveAvgPool3d(layers.Layer):
+    """
+
+    This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
+    of the output tensor are determined by the parameter output_size.
+
+    For avg adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+
+
+    Parameters:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
+            the size will be the same as that of the input.
+        data_format (str): The data format of the input and output data. An optional string
+            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
+            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Shape:
+        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32 or float64.
+        output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x.
+
+    Returns:
+        A callable object of AdaptiveAvgPool3d.
+
+    Examples:
+        .. code-block:: python
+
+            # adaptive avg pool3d
+            # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n],
+            # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+            # of input data into l * m * n grids averagely and performs poolings in each
+            # grid to get output.
+            # adaptive avg pool performs calculations as follow:
+            #
+            #     for i in range(l):
+            #         for j in range(m):
+            #             for k in range(n):
+            #                 dstart = floor(i * D / l)
+            #                 dend = ceil((i + 1) * D / l)
+            #                 hstart = floor(j * H / m)
+            #                 hend = ceil((j + 1) * H / m)
+            #                 wstart = floor(k * W / n)
+            #                 wend = ceil((k + 1) * W / n)
+            #                 output[:, :, i, j, k] =
+            #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            input_data = np.random.rand(2, 3, 8, 32, 32)
+            x = paddle.to_tensor(input_data)
+            # x.shape is [2, 3, 8, 32, 32]
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3)
+            pool_out = adaptive_avg_pool(x = x)
+            # pool_out = [2, 3, 3, 3, 3]
+    """
+
+    def __init__(self, output_size, data_format="NCDHW", name=None):
+        super(AdaptiveAvgPool3d, self).__init__()
+        self._output_size = output_size
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.adaptive_avg_pool3d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name)
+
+
+class AvgPool1d(layers.Layer):
+    """
+    This operation applies a 1D average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k])
+
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`. If padding is non-zero,
+            then the input is implicitly zero-padded on both sides for padding number of points.
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
+            If it is set to False, the floor function will be used. Default False
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        None.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AvgPool1d = nn.AvgPool1d(kernel_size=2, stride=2, padding=0)
+          pool_out = AvgPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 count_include_pad=True,
+                 ceil_mode=False,
+                 name=None):
+        super(AvgPool1d, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.name = name
+
+    def forward(self, x):
+        out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding,
+                           self.count_include_pad, self.ceil_mode, self.name)
+        return out
+
+
+class MaxPool1d(layers.Layer):
+    """
+    Applies a 1D max pooling over an input signal composed of several input planes based
+    on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, L_{out}) and kernel_size k can be precisely described as
+    For average pool1d:
+
+    ..  math::
+
+       Output(N_i, C_i, l) &=  max(Input[N_i, C_i, stride \times l:stride \times l+k])}
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one integers.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain one integers.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be the following forms: `[pad_left, pad_right]`.
+        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
+        ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
+            If it is set to False, the floor function will be used. Default False
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        None.
+
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ValueError: If `padding` is a list or tuple but its length greater than 1.
+        ShapeError: If the input is not a 3-D.
+        ShapeError: If the output's shape calculated is not greater than 0.
+
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
+          pool_out = MaxPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+          MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True)
+          pool_out, indices = MaxPool1d(data)
+          # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 return_indices=False,
+                 ceil_mode=False,
+                 name=None):
+        super(MaxPool1d, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.return_indices = return_indices
+        self.name = name
+
+    def forward(self, input):
+        out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
+                           self.return_indices, self.ceil_mode, self.name)
+        return out
+
+
+class AdaptiveAvgPool1d(layers.Layer):
+    """
+
+    This operation applies a 1D adaptive average pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For average adaptive pool1d:
+
+    ..  math::
+
+       lstart &= floor(i * L_{in} / L_{out})
+
+       lend &= ceil((i + 1) * L_{in} / L_{out})
+
+       Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)}
+
+    Args:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain one int.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        None.
+
+    Raises:
+        ValueError: 'pool_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+          # average adaptive pool1d
+          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+          # output shape is [N, C, m], adaptive pool divide L dimension
+          # of input data into m grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(m):
+          #         lstart = floor(i * L / m)
+          #         lend = ceil((i + 1) * L / m)
+          #         output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend)
+          #
+          import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16)
+          pool_out = AdaptiveAvgPool1d(data)
+          # pool_out shape: [1, 3, 16]
+    """
+
+    def __init__(self, output_size, name=None):
+        super(AdaptiveAvgPool1d, self).__init__()
+        self.output_size = output_size
+        self.name = name
+
+    def forward(self, input):
+        return F.adaptive_avg_pool1d(input, self.output_size, self.name)
+
+
+class AdaptiveMaxPool1d(layers.Layer):
+    """
+
+    This operation applies a 1D adaptive max pooling over an input signal composed
+    of several input planes, based on the input, output_size, return_indices parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, output_size].
+
+    For max adaptive pool1d:
+
+    ..  math::
+
+       lstart &= floor(i * L_{in} / L_{out})
+
+       lend &= ceil((i + 1) * L_{in} / L_{out})
+
+       Output(i) &= max(Input[lstart:lend])}
+
+    Args:
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+             it must contain one int.
+        return_indices (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type. Default False.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Returns:
+        None.
+
+    Raises:
+        ValueError: 'pool_size' should be a integer or list or tuple with length as 1.
+
+    Examples:
+        .. code-block:: python
+
+          # max adaptive pool1d
+          # suppose input data in shape of [N, C, L], `output_size` is m or [m],
+          # output shape is [N, C, m], adaptive pool divide L dimension
+          # of input data into m grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(m):
+          #         lstart = floor(i * L / m)
+          #         lend = ceil((i + 1) * L / m)
+          #         output[:, :, i] = max(input[:, :, lstart: lend])
+          #
+                    import paddle
+          import paddle.nn as nn
+          paddle.disable_static()
+
+          data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
+          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16)
+          pool_out = AdaptiveMaxPool1d(data)
+          # pool_out shape: [1, 3, 16]
+
+          # for return_indices = true
+          AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True)
+          pool_out, indices = AdaptiveMaxPool1d(data)
+          # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
+
+    """
+
+    def __init__(self, output_size, return_indices=False, name=None):
+        super(AdaptiveMaxPool1d, self).__init__()
+        self.output_size = output_size
+        self.return_indices = return_indices
+        self.name = name
+
+    def forward(self, input):
+        return F.adaptive_max_pool1d(input, self.output_size,
+                                     self.return_indices, self.name)
+
+
+class AvgPool2d(layers.Layer):
+    """
+    This operation applies 2D average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w)  = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
+                               input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
+           $$
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is `true`.
+        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+
+    Returns: None.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          AvgPool2d = nn.AvgPool2d(kernel_size=2,
+                                stride=2, padding=0)
+          output = AvgPoo2d(input)
+          # output.shape [1, 3, 16, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCHW",
+                 name=None):
+        super(AvgPool2d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.avg_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class MaxPool2d(layers.Layer):
+    """
+    This operation applies 2D max pooling over input feature based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+      Input:
+           X shape: $(N, C, H_{in}, W_{in})$
+      Attr:
+           kernel_size: ksize
+
+      Output:
+           Out shape: $(N, C, H_{out}, W_{out})$
+           $$
+           out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\
+                                    & \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                                                   \text{stride[1]} \times w + n)
+           $$
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int. Default: kernel_size.
+        padding (string|int|list|tuple): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_height, pad_width]` or
+            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`,
+            `pool_padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+            Otherwise, the pool padding size will be a square of an int.
+        ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
+        return_indices (bool): Whether to return the max indices along with the outputs.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns: None
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool2d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32))
+          MaxPool2d = nn.MaxPool2d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = MaxPool2d(input)
+          # output.shape [1, 3, 16, 16]
+
+          # for return_indices=True
+          MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True)
+          output, max_indices = MaxPool2d(input)
+          # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 return_indices=False,
+                 ceil_mode=False,
+                 data_format="NCHW",
+                 name=None):
+        super(MaxPool2d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.max_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            return_indices=self.return_indices,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class MaxPool3d(layers.Layer):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int. Default kernel_size.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): when True, will use ceil instead of floor to compute the output shape.
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is True.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+
+    Returns:None.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # max pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
+          MaxPool3d = nn.MaxPool3d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = MaxPool3d(input)
+          # output.shape [1, 2, 3, 16, 16]
+
+          # for return_indices=True
+          MaxPool3d = nn.MaxPool3d(kernel_size=2,stride=2, padding=0, return_indices=True)
+          output, max_indices = MaxPool3d(input)
+          # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16],
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding,
+                 return_indices=False,
+                 ceil_mode=False,
+                 data_format="NCDHW",
+                 name=None):
+        super(MaxPool3d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.return_indices = return_indices
+        self.ceil_mode = ceil_mode
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.max_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            return_indices=self.return_indices,
+            data_format=self.data_format,
+            name=self.name)
+
+
+class AvgPool3d(layers.Layer):
+    """
+    This operation applies 3D max pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCDHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature,  D is the depth of the feature, and W is the width of the feature.
+
+    Args:
+        kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain three integers,
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride (string|int|list|tuple)): The pool padding. If `pool_padding` is a string, either 'VALID' or
+            'SAME' which is the padding algorithm. If pool stride size is a tuple or list,
+            it must contain three integers, `[stride_Depth, stride_Height, stride_Width]`.
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (int|list|tuple): The pool padding size. If pool padding size is a tuple or list,
+            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
+            `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
+            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
+            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
+        ceil_mode (bool): ${ceil_mode_comment}
+        count_include_pad (bool): Whether to exclude padding points in average pooling
+                          mode, default is True.
+        divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
+        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns: None.
+    Raises:
+        ValueError: If `padding` is a string, but not "SAME" or "VALID".
+        ValueError: If `padding` is "VALID", but `ceil_mode` is True.
+        ShapeError: If the output's shape calculated is not greater than 0.
+    Examples:
+        .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+          paddle.disable_static()
+
+          # avg pool3d
+          input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32))
+          AvgPool3d = nn.AvgPool3d(kernel_size=2,
+                                   stride=2, padding=0)
+          output = AvgPool3d(input)
+          # output.shape [1, 2, 3, 16, 16]
+
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding=0,
+                 ceil_mode=False,
+                 count_include_pad=True,
+                 divisor_override=None,
+                 data_format="NCDHW",
+                 name=None):
+        super(AvgPool3d, self).__init__()
+        self.ksize = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad
+        self.divisor = divisor_override
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.avg_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            count_include_pad=self.count_include_pad,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name)
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 2b926b5ab369046fc07c3d3d8cd56431d7f740a7..50a8755ac9f7b0a8e35c60f02a9fb825195ab80f 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -13,4 +13,1107 @@
 # limitations under the License.
 
 # TODO: define the classes of Transformer neural network
-# __all__ = [ ]
+__all__ = [
+    'MultiHeadAttention',
+    'TransformerEncoderLayer',
+    'TransformerEncoder',
+    'TransformerDecoderLayer',
+    'TransformerDecoder',
+    'Transformer',
+]
+
+import copy
+import collections
+
+from ...fluid import layers
+from ...fluid.param_attr import ParamAttr
+from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList
+from .. import functional as F
+from ...fluid.layers import utils
+from ...fluid.layers.utils import map_structure
+
+
+def _convert_param_attr_to_list(param_attr, n):
+    """
+    If `param_attr` is a list or tuple, convert every element in it to a
+    ParamAttr instance. Otherwise, repeat `param_attr` `n` times to
+    construct a list, and rename every one by appending a increasing index
+    suffix to avoid having same names when `param_attr` contains a name.
+
+    Parameters:
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`.
+        n (int): The times to repeat to construct a list when `param_attr`
+            is not a list or tuple.
+
+    Returns:
+        list: A list composed of each including cell's `param_attr`.
+    """
+    if isinstance(param_attr, (list, tuple)):
+        assert len(param_attr) == n, (
+            "length of param_attr should be %d when it is a list/tuple" % n)
+        param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+    else:
+        param_attrs = []
+        attr = ParamAttr._to_attr(param_attr)
+        for i in range(n):
+            attr_i = copy.deepcopy(attr)
+            if attr.name:
+                attr_i.name = attr_i.name + "_" + str(i)
+            param_attrs.append(attr_i)
+    return param_attrs
+
+
+class MultiHeadAttention(Layer):
+    """
+    Attention mapps queries and a set of key-value pairs to outputs, and
+    Multi-Head Attention performs multiple parallel attention to jointly attending
+    to information from different representation subspaces.
+
+    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
+    for more details.
+
+    Parameters:
+        embed_dim (int): The expected feature size in the input and output.
+        num_heads (int): The number of heads in multi-head attention.
+        dropout (float, optional): The dropout probability used on attention
+            weights to drop some attention targets. 0 for no dropout. Default 0
+        kdim (int, optional): The feature size in key. If None, assumed equal to
+            `embed_dim`. Default None.
+        vdim (int, optional): The feature size in value. If None, assumed equal to
+            `embed_dim`. Default None.
+        need_weights (bool, optional): Indicate whether to return the attention
+            weights. Default False.
+        weight_attr(ParamAttr, optional):  To specify the weight parameter property.
+            Default: None, which means the default weight parameter property is used.
+            See usage for details in :code:`ParamAttr` .
+        bias_attr (ParamAttr, optional): To specify the bias parameter property.
+            Default: None, which means the default bias parameter property is used.
+            If it is set to False, this layer will not have trainable bias parameter.
+            See usage for details in :code:`ParamAttr` .
+         
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            # encoder input: [batch_size, sequence_length, d_model]
+            query = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, num_heads, query_len, query_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            multi_head_attn = paddle.MultiHeadAttention(128, 2)
+            output = multi_head_attn(query, attn_mask=attn_mask)  # [2, 4, 128]
+    """
+
+    Cache = collections.namedtuple("Cache", ["k", "v"])
+    StaticCache = collections.namedtuple("StaticCache", ["k", "v"])
+
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 dropout=0.,
+                 kdim=None,
+                 vdim=None,
+                 need_weights=False,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(MultiHeadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.need_weights = need_weights
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.q_proj = Linear(
+            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.k_proj = Linear(
+            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.v_proj = Linear(
+            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.out_proj = Linear(
+            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+
+    def _prepare_qkv(self, query, key, value, cache=None):
+        """
+        Prapares linear projected queries, keys and values for usage of subsequnt
+        multiple parallel attention. If `cache` is not None, using cached results
+        to reduce redundant calculations.
+
+        Parameters:
+            query (Tensor): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, query_length, embed_dim]`. The
+                data type should be float32 or float64.
+            key (Tensor): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If None, use `query` as
+                `key`.
+            value (Tensor): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, use `query` as
+                `value`.
+            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
+                It is a namedtuple with `k` and `v` as fields, and stores tensors
+                shaped `[batch_size, num_heads, length, embed_dim]` which are results
+                of linear projection, reshape and transpose calculations in
+                MultiHeadAttention. If is an instance of `Cache`, `k` and `v`
+                fields reserve intermediate results of previous positions, which
+                mostly used for decoder self attention. If it is an instance of
+                `StaticCache`, `key` and `value` args would be ignored, `k` and
+                `v` fields would be used as calculated results on `key` and
+                `value`, which mostly used for decoder-encoder cross attention.
+                It is only used for inference and should be None for training.
+                Default None.
+
+        Returns:
+            tuple: A tuple including linear projected keys and values. These two \
+                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
+                and `[batch_size, n_head, sequence_length, d_value]` separately, \
+                and their data types are same as inputs.
+        """
+        q = self.q_proj(query)
+        q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
+        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
+
+        if isinstance(cache, self.StaticCache):
+            # for encoder-decoder attention in inference and has cached
+            k, v = cache.k, cache.v
+        else:
+            k, v = self.compute_kv(key, value)
+
+        if isinstance(cache, self.Cache):
+            # for decoder self-attention in inference
+            k = layers.concat([cache.k, k], axis=2)
+            v = layers.concat([cache.v, v], axis=2)
+            cache = self.Cache(k, v)
+
+        return (q, k, v) if cache is None else (q, k, v, cache)
+
+    def compute_kv(self, key, value):
+        """
+        Applies linear projection on input keys and values, then splits heads
+        (reshape and transpose) to get keys and values from different representation
+        subspaces. The results are used as key-values pairs for subsequent multiple
+        parallel attention.
+        
+        It is part of calculations in multi-head attention, and is provided as
+        a method to pre-compute and prefetch these results, thus we can use them
+        to construct cache for inference.
+
+        Parameters:
+            key (Tensor): The keys for multi-head attention. It is a tensor
+                with shape `[batch_size, sequence_length, kdim]`. The data type
+                should be float32 or float64.
+            value (Tensor): The values for multi-head attention. It is a tensor
+                with shape `[batch_size, sequence_length, vdim]`. The data type
+                should be float32 or float64.
+
+        Returns:
+            tuple: A tuple including transformed keys and values. Their shapes \
+                both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \
+                and their data types are same as inputs.
+        """
+        k = self.k_proj(key)
+        v = self.v_proj(value)
+        k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
+        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
+        v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
+        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
+        return k, v
+
+    def gen_cache(self, key, value=None, type=Cache):
+        """
+        Generates cache for `forward` usage in inference accroding to arguments.
+        The generated cache is an instance of `MultiHeadAttention.Cache` or an
+        instance of `MultiHeadAttention.StaticCache`.
+
+        `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields,
+        and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]`
+        which are results of linear projection, reshape and transpose calculations
+        in MultiHeadAttention.
+        
+        If the generated cache is an instance of `Cache`, `k` and `v` fields
+        reserve intermediate result tensors of previous positions, and the tensors
+        are incremental among decoding steps, which mostly are used for decoder
+        decoder self attention.
+        
+        If the generated cache is an instance of `StaticCache`, `k` and `v` fields
+        would be used as calculated result tensors on keys an values in `forward`,
+        and the tensors keep unchanged among decoding steps, which are mostly used
+        for decoder-encoder cross attention.
+
+        The cache is generated as follows:
+
+        1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the
+        results to create an instance of `StaticCache`.
+        
+        2. If `type` is `Cache` and `value` is None, generate empty tensors shaped
+        `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results
+        to create an instance of `Cache`, where `batch_size` is from the first
+        dimension of `key`.
+
+        3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create
+        an instance of `Cache`.
+
+        Parameters:
+            key (Tensor): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If `value` is None,
+                it is only for batch size and data type reference.
+            value (Tensor, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, `key` is only
+                for batch size reference. Default None.
+            type (type): It should be `MultiHeadAttention.StaticCache` or
+                `MultiHeadAttention.Cache` to indicate the cache type to generate.
+        
+        Returns:
+            namedtuple: an instance of `Cache` or `StaticCache` accordingly.
+        """
+        if type == MultiHeadAttention.StaticCache:  # static_kv
+            k, v = self.compute_kv(key, value)
+            return self.StaticCache(k, v)
+        elif value is None:  # incremental_state
+            k = layers.fill_constant_batch_size_like(
+                input=key,
+                shape=[-1, self.num_heads, 0, self.head_dim],
+                dtype=key.dtype,
+                value=0)
+            v = layers.fill_constant_batch_size_like(
+                input=key,
+                shape=[-1, self.num_heads, 0, self.head_dim],
+                dtype=key.dtype,
+                value=0)
+            return self.Cache(k, v)
+        else:
+            # incremental_state with initial value, mainly for usage like UniLM
+            return self.Cache(key, value)
+
+    def forward(self, query, key, value, attn_mask=None, cache=None):
+        """
+        Applies multi-head attention to map queries and a set of key-value pairs
+        to outputs.
+
+        Parameters:
+            query (Tensor): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, query_length, embed_dim]`. The
+                data type should be float32 or float64.
+            key (Tensor, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If None, use `query` as
+                `key`. Default None.
+            value (Tensor, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, use `query` as
+                `value`. Default None.
+            attn_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
+                It is a namedtuple with `k` and `v` as fields, and stores tensors
+                shaped `[batch_size, num_heads, length, embed_dim]` which are results
+                of linear projection, reshape and transpose calculations in
+                MultiHeadAttention. If it is an instance of `Cache`, `k` and `v`
+                fields reserve intermediate results of previous positions, which
+                mostly used for decoder self attention. If it is an instance of
+                `StaticCache`, `key` and `value` args would be ignored, `k` and
+                `v` fields would be used as calculated results on `key` and
+                `value`, which mostly used for decoder-encoder cross attention.
+                It is only used for inference and should be None for training.
+                Default None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `query`, representing attention output. Or a tuple if \
+                `need_weights` is True or `cache` is not None. If `need_weights` \
+                is True, except for attention output, the tuple also includes \
+                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
+                If `cache` is not None, the tuple then includes the new cache \
+                having the same type as `cache`, and if it is `StaticCache`, it \
+                is same as the input `cache`, if it is `Cache`, the new cache \
+                reserves tensors concatanating raw tensors with intermediate \
+                results of current query.
+        """
+        key = query if key is None else key
+        value = query if value is None else value
+        # compute q ,k ,v
+        if cache is None:
+            q, k, v = self._prepare_qkv(query, key, value, cache)
+        else:
+            q, k, v, cache = self._prepare_qkv(query, key, value, cache)
+
+        # scale dot product attention
+        product = layers.matmul(
+            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        if attn_mask is not None:
+            # TODO(guosheng): support bool mask
+            product = product + attn_mask
+        weights = layers.softmax(product)
+        if self.dropout:
+            weights = layers.dropout(
+                weights,
+                dropout_prob=self.dropout,
+                dropout_implementation="upscale_in_train",
+                is_test=False)
+
+        out = layers.matmul(weights, v)
+
+        # combine heads
+        out = layers.transpose(out, perm=[0, 2, 1, 3])
+        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
+
+        # project to output
+        out = self.out_proj(out)
+
+        outs = [out]
+        if self.need_weights:
+            outs.append(weights)
+        if cache is not None:
+            outs.append(cache)
+        return out if len(outs) == 1 else tuple(outs)
+
+
+class TransformerEncoderLayer(Layer):
+    """
+    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
+    attention and feedforward network. Before and after each sub-layer, pre-process
+    and post-precess would be applied on the input and output accordingly. If
+    `normalize_before` is True, pre-process is layer normalization and post-precess
+    includes dropout, residual connection. Otherwise, no pre-process and post-precess
+    includes dropout, residual connection, layer normalization.
+
+    Parameters:
+        d_model (int): The expected feature size in the input and output.
+        nhead (int): The number of heads in multi-head attention(MHA).
+        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-precess of MHA and FFN sub-layer. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activition.  If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-precess includes dropout, residual connection.
+            Otherwise, no pre-process and post-precess includes dropout, residual
+            connection, layer normalization. Default False
+        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.
+            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
+            Default: None, which means the default weight parameter property is used.
+            See usage for details in :code:`ParamAttr` . 
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
+            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
+            The `False` value means the corresponding layer would not have trainable
+            bias parameter. See usage for details in :code:`ParamAttr` . Default: None,
+            which means the default bias parameter property is used.
+            
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerEncoderLayer
+
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, n_head, src_len, src_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            encoder_layer = TransformerEncoderLayer(128, 2, 512)
+            enc_output = encoder_layer(enc_input, attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False,
+                 weight_attr=None,
+                 bias_attr=None):
+        self._config = locals()
+        self._config.pop("self")
+        self._config.pop("__class__", None)  # py3
+
+        super(TransformerEncoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
+        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)
+
+        self.self_attn = MultiHeadAttention(
+            d_model,
+            nhead,
+            dropout=attn_dropout,
+            weight_attr=weight_attrs[0],
+            bias_attr=bias_attrs[0])
+        self.linear1 = Linear(
+            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])
+        self.dropout = Dropout(
+            act_dropout, dropout_implementation="upscale_in_train")
+        self.linear2 = Linear(
+            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.dropout1 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.dropout2 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.activation = getattr(layers, activation)
+
+    def forward(self, src, src_mask=None):
+        """
+        Applies a Transformer encoder layer on the input.
+
+        Parameters:
+            src (Tensor): The input of Transformer encoder layer. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            src_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+
+        Returns:
+            Tensor: The output of Transformer encoder layer. It is a tensor that \
+                has the same shape and data type as `enc_input`.
+        """
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        # TODO(guosheng): Add cache for encoder for the usage like UniLM
+        src = self.self_attn(src, src, src, src_mask)
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+class TransformerEncoder(Layer):
+    """
+    TransformerEncoder is a stack of N encoder layers. 
+
+    Parameters:
+        encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It
+            would be used as the first layer, and the other layers would be created
+            according to the configurations of it.
+        num_layers (int): The number of encoder layers to be stacked.
+        norm (LayerNorm, optional): the layer normalization component. If provided,
+            apply layer normalization on the output of last encoder layer.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerEncoderLayer, TransformerEncoder
+
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, n_head, src_len, src_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            encoder_layer = TransformerEncoderLayer(128, 2, 512)
+            encoder = TransformerEncoder(encoder_layer, 2)
+            enc_output = encoder(enc_input, attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super(TransformerEncoder, self).__init__()
+        self.layers = LayerList([(encoder_layer if i == 0 else
+                                  type(encoder_layer)(**encoder_layer._config))
+                                 for i in range(num_layers)])
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None):
+        """
+        Applies a stack of N Transformer encoder layers on inputs. If `norm` is
+        provided, also applies layer normalization on the output of last encoder
+        layer.
+
+        Parameters:
+            src (Tensor): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+            src_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+
+        Returns:
+            Tensor: The output of Transformer encoder. It is a tensor that \
+                has the same shape and data type as `src`.
+        """
+        output = src
+
+        for mod in self.layers:
+            output = mod(output, src_mask=src_mask)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerDecoderLayer(Layer):
+    """
+    TransformerDecoderLayer is composed of three sub-layers which are decoder
+    self (multi-head) attention, decoder-encoder cross attention and feedforward
+    network. Before and after each sub-layer, pre-process and post-precess would
+    be applied on the input and output accordingly. If `normalize_before` is True,
+    pre-process is layer normalization and post-precess includes dropout, residual
+    connection. Otherwise, no pre-process and post-precess includes dropout, residual
+    connection, layer normalization.
+
+    Parameters:
+        d_model (int): The expected feature size in the input and output.
+        nhead (int): The number of heads in multi-head attention(MHA).
+        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-precess of MHA and FFN sub-layer. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activition.  If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-precess includes dropout, residual connection.
+            Otherwise, no pre-process and post-precess includes dropout, residual
+            connection, layer normalization. Default False
+        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            self attention, `weight_attr[1]` would be used as `weight_attr` for
+            cross attention, and `weight_attr[2]` would be used as `weight_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `weight_attr` to create parameters. Default: None, which means the
+            default weight parameter property is used. See usage for details
+            in :ref:`api_fluid_ParamAttr` . 
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            self attention, `bias_attr[1]` would be used as `bias_attr` for
+            cross attention, and `bias_attr[2]` would be used as `bias_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `bias_attr` to create parameters. The `False` value means the
+            corresponding layer would not have trainable bias parameter. See
+            usage for details in :code:`ParamAttr` . Default: None,which means
+            the default bias parameter property is used.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerDecoderLayer
+
+            # decoder input: [batch_size, tgt_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention mask: [batch_size, n_head, tgt_len, tgt_len]
+            self_attn_mask = paddle.rand((2, 2, 4, 4))
+            # cross attention mask: [batch_size, n_head, tgt_len, src_len]
+            cross_attn_mask = paddle.rand((2, 2, 4, 6))
+            decoder_layer = TransformerDecoderLayer(128, 2, 512)
+            output = decoder_layer(dec_input,
+                                   enc_output,
+                                   self_attn_mask,
+                                   cross_attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False,
+                 weight_attr=None,
+                 bias_attr=None):
+        self._config = locals()
+        self._config.pop("self")
+        self._config.pop("__class__", None)  # py3
+
+        super(TransformerDecoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
+        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
+
+        self.self_attn = MultiHeadAttention(
+            d_model,
+            nhead,
+            dropout=attn_dropout,
+            weight_attr=weight_attrs[0],
+            bias_attr=bias_attrs[0])
+        self.cross_attn = MultiHeadAttention(
+            d_model,
+            nhead,
+            dropout=attn_dropout,
+            weight_attr=weight_attrs[1],
+            bias_attr=bias_attrs[1])
+        self.linear1 = Linear(
+            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.dropout = Dropout(
+            act_dropout, dropout_implementation="upscale_in_train")
+        self.linear2 = Linear(
+            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.norm3 = LayerNorm(d_model)
+        self.dropout1 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.dropout2 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.dropout3 = Dropout(
+            dropout, dropout_implementation="upscale_in_train")
+        self.activation = getattr(layers, activation)
+
+    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
+        """
+        Applies a Transformer decoder layer on the input.
+
+        Parameters:
+            tgt (Tensor): The input of Transformer decoder layer. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt_mask (Tensor, optional): A tensor used in self attention
+                to prevents attention to some unwanted positions, usually the
+                the subsequent positions. It is a tensor with shape broadcasted
+                to `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            memory_mask (Tensor, optional): A tensor used in decoder-encoder
+                cross attention to prevents attention to some unwanted positions,
+                usually the paddings. It is a tensor with shape broadcasted to
+               `[batch_size, n_head, target_length, source_length]`, where the
+                unwanted positions have `-INF` values and the others have 0 values.
+                The data type should be float32 or float64. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None
+            cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ),
+                `incremental_cache` is an instance of `MultiHeadAttention.Cache`,
+                `static_cache` is an instance of `MultiHeadAttention.StaticCache.
+                See `TransformerDecoderLayer.gen_cache` for more details. It is
+                only used for inference and should be None for training. Default
+                None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `tgt`, representing the output of Transformer decoder layer. \
+                Or a tuple if `cache` is not None, except for decoder layer output, \
+                the tuple includes the new cache which is same as input `cache` \
+                argument but `incremental_cache` in it has an incremental length. \
+                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
+                for more details.
+        """
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm1(tgt)
+        if cache is None:
+            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None)
+        else:
+            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
+                                                    cache[0])
+        tgt = residual + self.dropout1(tgt)
+        if not self.normalize_before:
+            tgt = self.norm1(tgt)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm2(tgt)
+        if cache is None:
+            tgt = self.cross_attn(tgt, memory, memory, memory_mask, None)
+        else:
+            tgt, static_cache = self.cross_attn(tgt, memory, memory,
+                                                memory_mask, cache[1])
+        tgt = residual + self.dropout2(tgt)
+        if not self.normalize_before:
+            tgt = self.norm2(tgt)
+
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm3(tgt)
+        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+        tgt = residual + self.dropout3(tgt)
+        if not self.normalize_before:
+            tgt = self.norm3(tgt)
+        return tgt if cache is None else (tgt, (incremental_cache,
+                                                static_cache))
+
+    def gen_cache(self, memory):
+        """
+        Generates cache for `forward` usage. The generated cache is a tuple
+        composed of an instance of `MultiHeadAttention.Cache` and an instance
+        of `MultiHeadAttention.StaticCache`.
+
+        Parameters:
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+
+        Returns:
+            tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). \
+                `incremental_cache` is an instance of `MultiHeadAttention.Cache` \
+                produced by `self_attn.gen_cache(memory, MultiHeadAttention.Cache)`, \
+                it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \
+                `static_cache` is an instance of `MultiHeadAttention.StaticCache` \
+                produced by `cross_attn.gen_cache(memory, MultiHeadAttention.StaticCache)`, \
+                it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`.
+                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
+                for more details.
+        """
+        incremental_cache = self.self_attn.gen_cache(
+            memory, type=self.self_attn.Cache)
+        static_cache = self.cross_attn.gen_cache(
+            memory, memory, type=self.cross_attn.StaticCache)
+        return incremental_cache, static_cache
+
+
+class TransformerDecoder(Layer):
+    """
+    TransformerDecoder is a stack of N decoder layers. 
+
+    Parameters:
+        decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It
+            would be used as the first layer, and the other layers would be created
+            according to the configurations of it.
+        num_layers (int): The number of decoder layers to be stacked.
+        norm (LayerNorm, optional): the layer normalization component. If provided,
+            apply layer normalization on the output of last encoder layer.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import TransformerDecoderLayer, TransformerDecoder
+
+            # decoder input: [batch_size, tgt_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention mask: [batch_size, n_head, tgt_len, tgt_len]
+            self_attn_mask = paddle.rand((2, 2, 4, 4))
+            # cross attention mask: [batch_size, n_head, tgt_len, src_len]
+            cross_attn_mask = paddle.rand((2, 2, 4, 6))
+            decoder_layer = TransformerDecoderLayer(128, 2, 512)
+            decoder = TransformerDecoder(decoder_layer, 2)
+            output = decoder(dec_input,
+                             enc_output,
+                             self_attn_mask,
+                             cross_attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self, decoder_layer, num_layers, norm=None):
+        super(TransformerDecoder, self).__init__()
+        self.layers = LayerList([(decoder_layer if i == 0 else
+                                  type(decoder_layer)(**decoder_layer._config))
+                                 for i in range(num_layers)])
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
+        """
+        Applies a stack of N Transformer decoder layers on inputs. If `norm` is
+        provided, also applies layer normalization on the output of last decoder
+        layer.
+
+        Parameters:
+            tgt (Tensor): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt_mask (Tensor, optional): A tensor used in self attention
+                to prevents attention to some unwanted positions, usually the
+                the subsequent positions. It is a tensor with shape broadcasted
+                to `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            memory_mask (Tensor, optional): A tensor used in decoder-encoder
+                cross attention to prevents attention to some unwanted positions,
+                usually the paddings. It is a tensor with shape broadcasted to
+               `[batch_size, n_head, target_length, source_length]`, where the
+                unwanted positions have `-INF` values and the others have 0 values.
+                The data type should be float32 or float64. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None
+            cache (list, optional): It is a list, and each element in the list
+                is a tuple( :code:`(incremental_cache, static_cache)` ). See
+                `TransformerDecoder.gen_cache` for more details. It is only
+                used for inference and should be None for training. Default None.
+
+        Returns:
+            Tensor|tuple: It is a tensor that has the same shape and data type \
+                as `tgt`, representing the output of Transformer decoder. \
+                Or a tuple if `cache` is not None, except for decoder output, \
+                the tuple includes the new cache which is same as input `cache` \
+                argument but `incremental_cache` in it has an incremental length. \
+                See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
+                for more details.
+        """
+        output = tgt
+        new_caches = []
+        for i, mod in enumerate(self.layers):
+            if cache is None:
+                output = mod(output,
+                             memory,
+                             tgt_mask=tgt_mask,
+                             memory_mask=memory_mask,
+                             cache=None)
+            else:
+                output, new_cache = mod(output,
+                                        memory,
+                                        tgt_mask=tgt_mask,
+                                        memory_mask=memory_mask,
+                                        cache=cache[i])
+                new_caches.append(new_cache)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output if cache is None else (output, new_caches)
+
+    def gen_cache(self, memory, do_zip=False):
+        """
+        Generates cache for `forward` usage. The generated cache is a list, and
+        each element in it is a tuple( :code:`(incremental_cache, static_cache)` )
+        produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`
+        for more details. If `do_zip` is True, apply `zip` on these tuples to get
+        a list with two elements.
+
+
+        Parameters:
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            do_zip (bool, optional): Indicate whether to apply `zip` on the tuples.
+                If True, return a list with two elements. Default False
+
+        Returns:
+            list: It is a list, and each element in the list is a tuple produced \
+                by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \
+                for more details. If `do_zip` is True, apply `zip` on these tuples \
+                and return a list with two elements.
+        """
+        cache = [layer.gen_cache(memory) for layer in self.layers]
+        if do_zip:
+            cache = list(zip(*cache))
+        return cache
+
+
+class Transformer(Layer):
+    """
+    A Transformer model composed of an instance of `TransformerEncoder` and an
+    instance of `TransformerDecoder`. While the embedding layer and output layer
+    are not included.
+
+    Please refer to `Attention is all you need <http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>`_ ,
+    and see `TransformerEncoder` and `TransformerDecoder` for more details.
+    
+    Users can configurate the model architecture with corresponding parameters.
+    Note the usage of `normalize_before` representing where to apply layer
+    normalization (in pre-process or post-precess of multi-head attention or FFN),
+    and some transformer like models are different on this, such as
+    `BERT <https://arxiv.org/abs/1810.04805>`_ and `GPT2 <https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf>`_ . 
+    The default architecture here places layer normalization in post-process and
+    applies another layer normalization on the output of last encoder/decoder layer.
+
+    Parameters:
+        d_model (int): The expected feature size in the encoder/decoder input
+            and output.
+        nhead (int): The number of heads in multi-head attention(MHA).
+        num_encoder_layers (int): The number of layers in encoder.
+        num_encoder_layers (int): The number of layers in decoder.
+        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        dropout (float, optional): The dropout probability used in pre-process
+            and post-precess of MHA and FFN sub-layer. Default 0.1
+        activation (str, optional): The activation function in the feedforward
+            network. Default relu.
+        attn_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. If None, use the value of
+            `dropout`. Default None
+        act_dropout (float, optional): The dropout probability used after FFN
+            activition.  If None, use the value of `dropout`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
+            normalization and post-precess includes dropout, residual connection.
+            Otherwise, no pre-process and post-precess includes dropout, residual
+            connection, layer normalization. Default False
+        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            self attention, `weight_attr[1]` would be used as `weight_attr` for
+            cross attention, and `weight_attr[2]` would be used as `weight_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `weight_attr` to create parameters. Default: None, which means the
+            default weight parameter property is used. See usage for details
+            in :code:`ParamAttr` . 
+        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            self attention, `bias_attr[1]` would be used as `bias_attr` for
+            cross attention, and `bias_attr[2]` would be used as `bias_attr`
+            for linear in FFN. Otherwise, the three sub-layers all uses it as
+            `bias_attr` to create parameters. The `False` value means the
+            corresponding layer would not have trainable bias parameter. See
+            usage for details in :code:`ParamAttr` . Default: None,which means
+            the default bias parameter property is used.
+        custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
+            Default None
+        custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
+            Default None
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle import Transformer
+
+            # src: [batch_size, tgt_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # tgt: [batch_size, src_len, d_model]
+            dec_input = paddle.rand((2, 6, 128))
+            # src_mask: [batch_size, n_head, src_len, src_len]
+            enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
+            # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
+            dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
+            # memory_mask: [batch_size, n_head, tgt_len, src_len]
+            cross_attn_mask = paddle.rand((2, 2, 6, 4))
+            transformer = Transformer(128, 2, 4, 4, 512)
+            output = transformer(enc_input,
+                                 dec_input,
+                                 enc_self_attn_mask,
+                                 dec_self_attn_mask,
+                                 cross_attn_mask)  # [2, 6, 128]
+    """
+
+    def __init__(self,
+                 d_model=512,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False,
+                 weight_attr=None,
+                 bias_attr=None,
+                 custom_encoder=None,
+                 custom_decoder=None):
+        super(Transformer, self).__init__()
+
+        if custom_encoder is not None:
+            self.encoder = custom_encoder
+        else:
+            encoder_layer = TransformerEncoderLayer(
+                d_model, nhead, dim_feedforward, dropout, activation,
+                attn_dropout, act_dropout, normalize_before, weight_attr,
+                bias_attr)
+            encoder_norm = LayerNorm(d_model)
+            self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
+                                              encoder_norm)
+
+        if custom_decoder is not None:
+            self.decoder = custom_decoder
+        else:
+            decoder_layer = TransformerDecoderLayer(
+                d_model, nhead, dim_feedforward, dropout, activation,
+                attn_dropout, act_dropout, normalize_before, weight_attr,
+                bias_attr)
+            decoder_norm = LayerNorm(d_model)
+            self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
+                                              decoder_norm)
+
+        self.d_model = d_model
+        self.nhead = nhead
+
+    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
+        """
+        Applies a Transformer model on the inputs.
+
+        Parameters:
+            src (Tensor): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt (Tensor): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            memory (Tensor): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            tgt_mask (Tensor, optional): A tensor used in self attention
+                to prevents attention to some unwanted positions, usually the
+                the subsequent positions. It is a tensor with shape broadcasted
+                to `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be prevented attention to.
+                Default None
+            memory_mask (Tensor, optional): A tensor used in decoder-encoder
+                cross attention to prevents attention to some unwanted positions,
+                usually the paddings. It is a tensor with shape broadcasted to
+               `[batch_size, n_head, target_length, source_length]`, where the
+                unwanted positions have `-INF` values and the others have 0 values.
+                The data type should be float32 or float64. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None
+
+        Returns:
+            Tensor: It is a tensor that has the same shape and data type \
+                as `tgt`, representing the output of Transformer decoder.
+        """
+        memory = self.encoder(src, src_mask=src_mask)
+        output = self.decoder(
+            tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
+        return output
diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f360ec02e6d8b59b80db4602776e904cf0b499
--- /dev/null
+++ b/python/paddle/nn/layer/vision.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO: define specitial functions used in computer vision task 
+
+from ...fluid.dygraph import layers
+from .. import functional
+
+__all__ = ['PixelShuffle']
+
+
+class PixelShuffle(layers.Layer):
+    """
+    
+    PixelShuffle Layer    
+
+    This operator rearranges elements in a tensor of shape [N, C, H, W]
+    to a tensor of shape [N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor],
+    or from shape [N, H, W, C] to [N, H*upscale_factor, W*upscale_factor, C/upscale_factor**2].
+    This is useful for implementing efficient sub-pixel convolution
+    with a stride of 1/upscale_factor.
+    Please refer to the paper: `Real-Time Single Image and Video Super-Resolution
+    Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_ .
+    by Shi et. al (2016) for more details.
+
+    Parameters:
+
+        upscale_factor(int): factor to increase spatial resolution.
+        data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x: 4-D tensor with shape: (N, C, H, W) or (N, H, W, C).
+        - out: 4-D tensor with shape: (N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor) or (N, H*upscale_factor, W*upscale_factor, C/upscale_factor^2).
+
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+            import paddle.nn as nn
+            import numpy as np
+
+            paddle.disable_static()
+            x = np.random.randn(2, 9, 4, 4).astype(np.float32)
+            x_var = paddle.to_tensor(x)
+            pixel_shuffle = nn.PixelShuffle(3)
+            out_var = pixel_shuffle(x_var)
+            out = out_var.numpy()
+            print(out.shape) 
+            # (2, 1, 12, 12)
+
+    """
+
+    def __init__(self, upscale_factor, data_format="NCHW", name=None):
+        super(PixelShuffle, self).__init__()
+
+        if not isinstance(upscale_factor, int):
+            raise TypeError("upscale factor must be int type")
+
+        if data_format not in ["NCHW", "NHWC"]:
+            raise ValueError("Data format should be 'NCHW' or 'NHWC'."
+                             "But recevie data format: {}".format(data_format))
+
+        self._upscale_factor = upscale_factor
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return functional.pixel_shuffle(x, self._upscale_factor,
+                                        self._data_format, self._name)
diff --git a/python/paddle/fleet/base/role_maker.py b/python/paddle/nn/utils/__init__.py
similarity index 84%
rename from python/paddle/fleet/base/role_maker.py
rename to python/paddle/nn/utils/__init__.py
index f6b5c8ac12e92dcbe6ca710f20d509cabaafac63..6562ac35e1e3180db671f90188f1304f07864189 100644
--- a/python/paddle/fleet/base/role_maker.py
+++ b/python/paddle/nn/utils/__init__.py
@@ -11,6 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Defination of Role Makers."""
 
-# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
+from . import weight_norm_hook
+from .weight_norm_hook import weight_norm, remove_weight_norm
diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad53bf394660f3a7e0e48fdbd5eb530abd0852bb
--- /dev/null
+++ b/python/paddle/nn/utils/weight_norm_hook.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from ... import fluid
+from ...fluid import dygraph
+from ...fluid import layers as F
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.data_feeder import check_variable_and_dtype
+from ...tensor.math import multiply
+
+__all__ = ['weight_norm', 'remove_weight_norm']
+
+
+def l2_norm(x, axis, epsilon=1e-12, name=None):
+    if len(x.shape) == 1:
+        axis = 0
+    check_variable_and_dtype(x, "X", ("float32", "float64"), "norm")
+
+    helper = LayerHelper("l2_normalize", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    norm = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="norm",
+        inputs={"X": x},
+        outputs={"Out": out,
+                 "Norm": norm},
+        attrs={
+            "axis": 1 if axis is None else axis,
+            "epsilon": epsilon,
+        })
+    return F.squeeze(norm, axes=[axis])
+
+
+def norm_except_dim(p, dim):
+    shape = p.shape
+    ndims = len(shape)
+    if dim == -1:
+        return F.sqrt(F.reduce_sum(F.square(p)) + 1e-12)
+    elif dim == 0:
+        p_matrix = F.reshape(p, (shape[0], -1))
+        return l2_norm(p_matrix, axis=1)
+    elif dim == ndims - 1:
+        p_matrix = F.reshape(p, (-1, shape[-1]))
+        return l2_norm(p_matrix, axis=0)
+    else:
+        perm = list(range(ndims))
+        perm[0] = dim
+        perm[dim] = 0
+        p_transposed = F.transpose(p, perm)
+        return norm_except_dim(p_transposed, 0)
+
+
+def _weight_norm(v, g, dim):
+    shape = v.shape
+    ndims = len(shape)
+
+    if dim == -1:
+        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
+    elif dim == 0:
+        p_matrix = F.reshape(v, (shape[0], -1))
+        v_normalized = F.l2_normalize(p_matrix, axis=1)
+        v_normalized = F.reshape(v_normalized, shape)
+    elif dim == ndims - 1:
+        p_matrix = F.reshape(v, (-1, shape[-1]))
+        v_normalized = F.l2_normalize(p_matrix, axis=0)
+        v_normalized = F.reshape(v_normalized, shape)
+    else:
+        perm = list(range(ndims))
+        perm[0] = dim
+        perm[dim] = 0
+        p_transposed = F.transpose(v, perm)
+        transposed_shape = p_transposed.shape
+        p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1))
+        v_normalized = F.l2_normalize(p_matrix, axis=1)
+        v_normalized = F.reshape(v_normalized, transposed_shape)
+        v_normalized = F.transpose(v_normalized, perm)
+    weight = multiply(v_normalized, g, axis=dim if dim is not None else -1)
+    return weight
+
+
+class WeightNorm(object):
+    def __init__(self, name, dim):
+        if dim is None:
+            dim = -1
+        self.name = name
+        self.dim = dim
+
+    def compute_weight(self, layer):
+        g = getattr(layer, self.name + '_g')
+        v = getattr(layer, self.name + '_v')
+        return _weight_norm(v, g, self.dim)
+
+    @staticmethod
+    def apply(layer, name, dim):
+        for k, hook in layer._forward_pre_hooks.items():
+            if isinstance(hook, WeightNorm) and hook.name == name:
+                raise RuntimeError("Cannot register two weight_norm hooks on "
+                                   "the same parameter {}".format(name))
+
+        if dim is None:
+            dim = -1
+
+        fn = WeightNorm(name, dim)
+
+        w = getattr(layer, name)
+        del layer._parameters[name]
+
+        g_var = norm_except_dim(w, dim)
+        v = layer.create_parameter(w.shape, dtype=w.dtype)
+        layer.add_parameter(name + "_v", v)
+        g = layer.create_parameter(g_var.shape, dtype=g_var.dtype)
+        layer.add_parameter(name + '_g', g)
+        with dygraph.no_grad():
+            F.assign(w, v)
+            F.assign(g_var, g)
+        setattr(layer, name, fn.compute_weight(layer))
+
+        layer.register_forward_pre_hook(fn)
+        return fn
+
+    def remove(self, layer):
+        w_var = self.compute_weight(layer)
+        delattr(layer, self.name)
+        del layer._parameters[self.name + '_g']
+        del layer._parameters[self.name + '_v']
+        w = layer.create_parameter(w_var.shape, dtype=w_var.dtype)
+        layer.add_parameter(self.name, w)
+        with dygraph.no_grad():
+            F.assign(w_var, w)
+
+    def __call__(self, layer, inputs):
+        setattr(layer, self.name, self.compute_weight(layer))
+
+
+def weight_norm(layer, name='weight', dim=0):
+    """
+    This weight_norm layer applies weight normalization to a parameter according to the 
+    following formula:
+
+    .. math::
+
+        \mathbf{w} = g \dfrac{v}{\|v\|}
+
+    Weight normalization is a reparameterization of the weight vectors in a neural network that 
+    decouples the magnitude of those weight vectors from their direction. Weight normalization 
+    replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter 
+    specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction 
+    (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: 
+    `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks
+    <https://arxiv.org/pdf/1602.07868.pdf>`_.
+
+    Parameters:
+        layer(Layer): Layer of paddle, which has weight.
+        name(str, optional): Name of the weight parameter. Default: 'weight'.
+        dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number 
+              which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, 
+              1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. 
+              If dim is set to None, meaning that all elements will be normalized. Default: 0.
+    
+    Returns:
+        Origin layer with weight norm hook.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          from paddle.nn import Conv2D
+          from paddle.nn.utils import weight_norm
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          conv = Conv2D(3, 5, 3)
+          wn = weight_norm(conv)
+          print(conv.weight_g.shape)
+          # [5]
+          print(conv.weight_v.shape)
+          # [5, 3, 3, 3]
+    """
+    WeightNorm.apply(layer, name, dim)
+    return layer
+
+
+def remove_weight_norm(layer, name='weight'):
+    """
+    remove weight normalization from layer.
+
+    Parameters:
+        layer(Layer): Layer of paddle, which has weight.
+        name(str, optional): Name of the weight parameter. Default: 'weight'.
+
+    Returns:
+        Origin layer without weight norm
+
+    Examples:
+        .. code-block:: python
+          import paddle
+          from paddle.nn import Conv2D
+          from paddle.nn.utils import weight_norm, remove_weight_norm
+
+          paddle.disable_static()
+          conv = Conv2D(3, 5, 3)
+          wn = weight_norm(conv)
+          remove_weight_norm(conv)
+          print(conv.weight_g)
+          # AttributeError: 'Conv2D' object has no attribute 'weight_g'
+    """
+    for k, hook in layer._forward_pre_hooks.items():
+        if isinstance(hook, WeightNorm) and hook.name == name:
+            hook.remove(layer)
+            del layer._forward_pre_hooks[k]
+            return layer
+
+    raise ValueError("weight_norm of '{}' not found in {}".format(name, layer))
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 4dc3cf397aea59f3fedfc86bff7a77556a6a63a7..49314c9832dd389411dffb3f498b34d09337a3f0 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -14,21 +14,32 @@
 
 __all__ = [
     'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
-    'Adamax', 'AdamaxOptimizer', 'AdamOptimizer', 'DecayedAdagrad',
-    'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd',
-    'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer',
-    'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer',
-    'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer',
-    'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD',
-    'SGDOptimizer'
+    'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer',
+    'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer',
+    'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer',
+    'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer',
+    'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer',
+    'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer',
+    '_LRScheduler', 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR',
+    'PolynomialLR', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR',
+    'LambdaLR', 'ReduceLROnPlateau', 'CosineAnnealingLR'
 ]
 
 
-from ..fluid.optimizer import  SGD, Momentum, Adagrad, Adam, Adamax, Dpsgd, DecayedAdagrad, \
-            Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, \
-            AdamOptimizer, AdamaxOptimizer, DpsgdOptimizer, \
-            DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \
-            AdadeltaOptimizer, ModelAverage, LarsMomentum, \
-            LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \
+from ..fluid.optimizer import  SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
+            Ftrl, Adadelta, \
+            SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
+            DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
+            ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
             ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
-            RecomputeOptimizer
+            RecomputeOptimizer, LarsMomentumOptimizer
+
+from .optimizer import Optimizer
+from .adam import Adam
+from .adamw import AdamW
+from .adamax import Adamax
+from .rmsprop import RMSProp
+
+from . import lr_scheduler
+from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
+            LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da8053fe8a3495f5d3188a737638531347de648
--- /dev/null
+++ b/python/paddle/optimizer/adam.py
@@ -0,0 +1,246 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["Adam"]
+
+
+class Adam(Optimizer):
+    """
+    The Adam optimizer uses an optimization described at the end
+    of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
+    it can dynamically adjusts the learning rate of each parameter using
+    the 1st moment estimates and the 2nd moment estimates of the gradient.
+    
+    The parameter ``param_out`` update rule with gradient ``grad``:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+
+    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
+
+    Args:
+        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.9.
+        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+            The default value is 1e-08.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+	    It canbe a float value as coeff of L2 regularization or \
+	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+	    the regularization setting here in optimizer will be ignored for this parameter. \
+	    Otherwise, the regularization setting here in optimizer will take effect. \
+	    Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+            The accumulators are updated at every step. Every element of the two moving-average
+            is updated in both dense mode and sparse mode. If the size of parameter is very large,
+            then the update may be very slow. The lazy mode only update the element that has
+            gradient in current mini-batch, so it will be much more faster. But this mode has
+            different semantics with the original Adam algorithm and may lead to different result.
+            The default value is False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.Adam(learning_rate=0.1,
+                    parameters=linear.parameters())
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+        .. code-block:: python
+
+            # Adam with beta1/beta2 as Tensor and weight_decay as float
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.Adam(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+    _moment1_acc_str = "moment1"
+    _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None,
+                 lazy_mode=False):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(Adam, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "adam"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+        self._lazy_mode = lazy_mode
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        # Create accumulator tensors for first and second moments
+        for p in parameters:
+            self._add_accumulator(self._moment1_acc_str, p)
+            self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                        else self._beta1,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                        else self._beta2,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment1 = self._get_accumulator(self._moment1_acc_str,
+                                        param_and_grad[0])
+        moment2 = self._get_accumulator(self._moment2_acc_str,
+                                        param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+        lr = self._create_param_lr(param_and_grad)
+        # create the adam optimize op
+
+        if framework.in_dygraph_mode():
+            _beta1 = self._beta1 if not isinstance(
+                self._beta1, Variable) else self._beta1.numpy().item(0)
+            _beta2 = self._beta2 if not isinstance(
+                self._beta2, Variable) else self._beta2.numpy().item(0)
+            _, _, _, _, _ = core.ops.adam(
+                param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
+                beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
+                moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
+                'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
+                1000, 'beta1', _beta1, 'beta2', _beta2)
+
+            return None
+
+        inputs = {
+            "Param": [param_and_grad[0]],
+            "Grad": [param_and_grad[1]],
+            "LearningRate": [lr],
+            "Moment1": [moment1],
+            "Moment2": [moment2],
+            "Beta1Pow": [beta1_pow_acc],
+            "Beta2Pow": [beta2_pow_acc]
+        }
+        outputs = {
+            "ParamOut": [param_and_grad[0]],
+            "Moment1Out": [moment1],
+            "Moment2Out": [moment2],
+            "Beta1PowOut": [beta1_pow_acc],
+            "Beta2PowOut": [beta2_pow_acc],
+        }
+        attrs = {
+            "epsilon": self._epsilon,
+            "lazy_mode": self._lazy_mode,
+            "min_row_size_to_use_multithread": 1000
+        }
+
+        if isinstance(self._beta1, Variable):
+            inputs['Beta1Tensor'] = self._beta1
+        else:
+            attrs['beta1'] = self._beta1
+        if isinstance(self._beta2, Variable):
+            inputs['Beta2Tensor'] = self._beta2
+        else:
+            attrs['beta2'] = self._beta2
+
+        adam_op = block.append_op(
+            type=self.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
+            stop_gradient=True)
+
+        return adam_op
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..73a78b17cbba55c1ee90a2708f6c163940158a51
--- /dev/null
+++ b/python/paddle/optimizer/adamax.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable, name_scope
+
+__all__ = ["Adamax"]
+
+
+class Adamax(Optimizer):
+    """
+    The Adamax optimizer is implemented based on the Adamax Optimization 
+    in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
+    The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
+    which makes the learning rate update algorithm more stable and simple.
+
+    The parameter ``param_out`` update rule with gradient ``grad``:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
+
+        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
+
+        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
+
+    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
+
+    The original paper does not have an ``epsilon`` attribute,
+    it is added here for numerical stability to prevent the division by 0 error.
+
+    Args:
+        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+            The default value is 0.9.
+        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+            The default value is 1e-08.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+	    It canbe a float value as coeff of L2 regularization or \
+	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+	    the regularization setting here in optimizer will be ignored for this parameter. \
+	    Otherwise, the regularization setting here in optimizer will take effect. \
+	    Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    **Notes**:
+        **Currently, Adamax doesn't support sparse parameter optimization.**
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.Adamax(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+    _moment_acc_str = "moment"
+    _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(Adamax, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "adamax"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        # Create accumulator tensors for first moment and infinity norm
+        for p in parameters:
+            self._add_accumulator(self._moment_acc_str, p)
+            self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                fill_value=self._beta1,
+                shape=[1])
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
+        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
+                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        # create the adamax optimize op
+        adamax_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "Moment": moment,
+                "InfNorm": inf_norm,
+                "Beta1Pow": beta1_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": moment,
+                "InfNormOut": inf_norm
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon
+            },
+            stop_gradient=True)
+
+        return adamax_op
+
+    def _finish_update(self, block, parameters_and_grads):
+        """Update Beta1 Power accumulator
+        """
+        assert isinstance(block, framework.Block)
+        for param, grad in parameters_and_grads:
+            if grad is None or param.trainable is False:
+                continue
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('adamax'):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1},
+                    stop_gradient=True)
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..f498fcbffa24ec188b57ceb2d3c6884fc1e135d2
--- /dev/null
+++ b/python/paddle/optimizer/adamw.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from .adam import Adam
+from ..fluid import framework
+import paddle
+__all__ = ['AdamW']
+
+
+class DecoupledWeightDecay(object):
+    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
+        if not isinstance(coeff, float) and \
+                not isinstance(coeff, framework.Variable):
+            raise TypeError("coeff should be float or Tensor.")
+        self._params_name = set()
+        self._apply_decay_param_fun = apply_decay_param_fun
+        self._coeff = coeff
+        super(DecoupledWeightDecay, self).__init__(**kwargs)
+
+    def _scale_parameters(self, params_and_grads):
+        """
+        Adds weight decay ops.
+            scaled_parameter = parameter * coeff
+
+        Args:
+            params_and_grads: A list of (parameters, gradients) pairs,
+                the parameters need to decay.
+        Raises:
+            Exception: The type of coeff and parameter is not consistent.
+        """
+        if isinstance(self._coeff, float) and self._coeff == 0.0:
+            return
+
+        scaled_params = []
+        for param, grad in params_and_grads:
+            # If no gradient then we don't need to do anything
+            if grad is None:
+                continue
+            if self._apply_decay_param_fun is not None \
+                    and not self._apply_decay_param_fun(param.name):
+                continue
+
+            if isinstance(self._coeff, float):
+                assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
+                    "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
+            else:
+                assert self._coeff.dtype == param.dtype, \
+                    "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
+
+            with param.block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+                assert param.name not in self._params_name
+                scaled_params.append((param, grad, param * self._coeff))
+                self._params_name.add(param.name)
+        return scaled_params
+
+    def backward(self, **kargs):
+        return super(DecoupledWeightDecay, self).backward(**kargs)
+
+    def _apply_optimize(self, **kargs):
+        return super(DecoupledWeightDecay, self)._apply_optimize(**kargs)
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameters=None,
+                 no_grad_set=None):
+        params_grads = self.backward(
+            loss=loss,
+            startup_program=startup_program,
+            parameters=parameters,
+            no_grad_set=no_grad_set)
+        scaled_params = self._scale_parameters(params_grads)
+        for p_grad_sgrad in scaled_params:
+            param, grad, scaled_param = p_grad_sgrad
+            with param.block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+                updated_param = paddle.fluid.layers.elementwise_sub(
+                    x=param, y=scaled_param)
+                paddle.fluid.layers.assign(input=updated_param, output=param)
+
+        optimize_ops = self._apply_optimize(
+            loss=loss,
+            params_grads=params_grads,
+            startup_program=startup_program)
+        return optimize_ops, params_grads
+
+    @framework.dygraph_only
+    def step(self):
+        parameter_list = self._parameter_list
+        self._dtype = None
+        params_grads = []
+        for param in self._parameter_list:
+            if not param.trainable:
+                continue
+            if param._grad_ivar() is not None:
+                grad_var = param._grad_ivar()
+                params_grads.append((param, grad_var))
+
+        scaled_params = self._scale_parameters(params_grads)
+        for p_grad_sgrad in scaled_params:
+            param, grad, scaled_param = p_grad_sgrad
+            with param.block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+                updated_param = paddle.fluid.layers.elementwise_sub(
+                    x=param, y=scaled_param)
+                paddle.fluid.layers.assign(input=updated_param, output=param)
+        optimize_ops = self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads)
+
+    def __str__(self):
+        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
+
+
+class AdamW(DecoupledWeightDecay, Adam):
+    """
+    The AdamW optimizer is implemented based on the AdamW Optimization 
+    in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
+    it can resolves the problem of L2 regularization failure in the Adam optimizer.
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+        
+        moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+            \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t}
+
+        param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
+
+
+    Args:
+        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay. The default value is 0.001.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.9.
+        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+            It should be a float number or a Tensor with shape [1] and data type as float32.
+            The default value is 0.999.
+        epsilon (float, optional): A small float value for numerical stability.
+        weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0.
+            The default value is 1e-08.
+        apply_decay_param_fun (function|None): If it is not None,
+            only tensors that makes apply_decay_param_fun(Tensor)==True 
+            will be updated. It only works when we want to specify tensors.
+            Default: None.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+            The accumulators are updated at every step. Every element of the two moving-average
+            is updated in both dense mode and sparse mode. If the size of parameter is very large,
+            then the update may be very slow. The lazy mode only update the element that has
+            gradient in current mini-batch, so it will be much more faster. But this mode has
+            different semantics with the original Adam algorithm and may lead to different result.
+            The default value is False.
+    **Notes**:
+        **Currently, AdamW doesn't support sparse parameter optimization.**
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.AdamW(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 parameters=None,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 weight_decay=0.0,
+                 apply_decay_param_fun=None,
+                 grad_clip=None,
+                 name=None,
+                 lazy_mode=False):
+        args_dict = {
+            "learning_rate": learning_rate,
+            "parameters": parameters,
+            "beta1": beta1,
+            "beta2": beta2,
+            "epsilon": epsilon,
+            "grad_clip": grad_clip,
+            "name": name,
+            "lazy_mode": lazy_mode
+        }
+        super(AdamW, self).__init__(
+            weight_decay,
+            apply_decay_param_fun=apply_decay_param_fun,
+            **args_dict)
diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b
--- /dev/null
+++ b/python/paddle/optimizer/lr_scheduler.py
@@ -0,0 +1,1430 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy
+import warnings
+from paddle import Tensor
+
+__all__ = [
+    'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
+    'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
+    'ReduceLROnPlateau', 'CosineAnnealingLR'
+]
+
+
+class _LRScheduler(object):
+    """LRScheduler Base class.
+
+    Define the common interface of an LRScheduler.
+    User can 'form paddle.optimizer.lr_scheduler import _LRScheduler'
+    And inherit from it to have a custom implementation of get_lr().
+    """
+
+    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
+        if not isinstance(learning_rate, (float, int)):
+            raise TypeError(
+                "The type of learning rate must be float, but received {}".
+                format(type(learning_rate)))
+        self.base_lr = float(learning_rate)
+        self.last_lr = float(learning_rate)
+        self.last_epoch = last_epoch
+        self.verbose = verbose
+        self._var_name = None
+
+        self.step()
+
+    def __call__(self):
+        """ 
+        Return last computed learning rate on current epoch.
+        """
+        return self.last_lr
+
+    def step(self, epoch=None):
+        """
+        'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'.  
+        The new learning rate will take effect on next epoch.
+
+        Args:
+            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+
+        Returns:
+            None
+        
+        Examples:
+            Please refer to the example of current _LRScheduler. 
+        """
+        if epoch is None:
+            self.last_epoch += 1
+            self.last_lr = self.get_lr()
+        else:
+            self.last_epoch = epoch
+            if hasattr(self, "_get_closed_form_lr"):
+                self.last_lr = self._get_closed_form_lr()
+            else:
+                self.last_lr = self.get_lr()
+
+        if self.verbose:
+            print('Epoch {}: {} set learning rate to {}.'.format(
+                self.last_epoch, self.__class__.__name__, self.last_lr))
+
+    def state_dict(self):
+        """
+        Returns the state of the scheduler as a :class:`dict`.
+
+        It is a subset of self.__dict__ .
+        """
+        self._state_keys()
+        state_dict = {}
+        for key in self.keys:
+            if key not in self.__dict__:
+                continue
+            value = self.__dict__[key]
+            if isinstance(value, Tensor):
+                assert value.shape == [
+                    1
+                ], "shape of Tensor in state_dict must be [1] {}".format(
+                    value.shape)
+                value = value.numpy()[0]
+            state_dict[key] = value
+
+        return state_dict
+
+    # For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default.
+    # (Note): you can change it for your subclass.
+    def _state_keys(self):
+        """
+        set the keys in self.__dict__ that are needed to be saved.
+        """
+        self.keys = ['last_epoch', 'last_lr']
+
+    def set_dict(self, state_dict):
+        """
+        Loads the schedulers state.
+        """
+        self._state_keys()
+        for key in self.keys:
+            if key in state_dict:
+                self.__dict__[key] = state_dict[key]
+            else:
+                raise RuntimeError(
+                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
+                    format(key))
+        if len(state_dict) > len(self.keys):
+            warnings.warn(
+                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
+            )
+
+    # alias for set_dict
+    set_state_dict = set_dict
+
+    def get_lr(self):
+        # calculate by python float
+        raise NotImplementedError
+
+
+class NoamLR(_LRScheduler):
+    """
+
+    Applies Noam Lear to the initial learning rate. 
+
+    The algorithm can be described as following.
+
+    .. math::
+
+        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})
+
+    Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ 
+
+
+    Args:
+        d$_{model}$(int): The dimensionality of input and output feature vector of model. It is a python int number.
+        warmup_steps(int): The number of warmup steps. A super parameter. It is a python int number
+        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``NoamLR`` instance to schedule learning rate.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+
+    """
+
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 learning_rate=1.0,
+                 last_epoch=-1,
+                 verbose=False):
+        self.d_model = d_model
+        self.warmup_steps = warmup_steps
+        super(NoamLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            a = 1
+        else:
+            a = self.last_epoch**-0.5
+        b = self.warmup_steps**-1.5 * self.last_epoch
+        return self.base_lr * (self.d_model**-0.5) * min(a, b)
+
+
+class PiecewiseLR(_LRScheduler):
+    """
+
+    Piecewise learning rate scheduler.
+
+    The algorithm can be described as the code below:
+
+    .. code-block:: text
+
+        boundaries = [100, 200]
+        values = [1.0, 0.5, 0.1]
+        if epoch < 100:
+            learning_rate = 1.0
+        elif 100 <= global_step < 200:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+
+    Args:
+        boundaries(list): A list of steps numbers. The type of element in the list is python int. 
+        values(list): A list of learning rate values that will be picked during different epoch boundaries. 
+            The type of element in the list is python float.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``PiecewiseLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
+        self.boundaries = boundaries
+        self.values = values
+        super(PiecewiseLR, self).__init__(
+            last_epoch=last_epoch, verbose=verbose)
+
+    def get_lr(self):
+
+        for i in range(len(self.boundaries)):
+            if self.last_epoch < self.boundaries[i]:
+                return self.values[i]
+        return self.values[len(self.values) - 1]
+
+
+class NaturalExpLR(_LRScheduler):
+    """
+
+    Applies natural exponential decay to the initial learning rate.
+    
+    The algorithm can be described as following:
+
+    .. math::
+
+        new\_learning\_rate = learning\_rate * e^{- gama * epoch}
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        gamma (float, optional): A Ratio to update the learning rate. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``NaturalExpLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)
+
+
+class InverseTimeLR(_LRScheduler):
+    """
+
+    Applies inverse time decay to the initial learning rate.
+
+    The algorithm can be described as following:
+
+    .. math::
+
+        new\_learning\_rate = \\frac{learning\_rate}{1 + gamma * epoch}
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``InverseTimeLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+
+    """
+
+    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr / (1 + self.gamma * self.last_epoch)
+
+
+class PolynomialLR(_LRScheduler):
+    """
+
+    Applies polynomial decay to the initial learning rate.
+
+    The algorithm can be described as following.
+
+    If cycle is set to True, then:
+
+    .. math::
+
+        decay\_steps & = decay\_steps * math.ceil(\\frac{epoch}{decay\_steps}) 
+
+        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+
+    If cycle is set to False, then:
+
+    .. math::
+
+        epoch & = min(epoch, decay\_steps) 
+
+        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        decay_steps(int): The decay step size. It determines the decay cycle.
+        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
+        power(float, optional): Power of polynomial. Default: 1.0.
+        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease 
+            to ``end_lr`` .  If False, the learning rate is monotone decreasing. Default: False.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``PolynomialLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 decay_steps,
+                 end_lr=0.0001,
+                 power=1.0,
+                 cycle=False,
+                 last_epoch=-1,
+                 verbose=False):
+        self.decay_steps = decay_steps
+        self.end_lr = end_lr
+        self.power = power
+        self.cycle = cycle
+        super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        tmp_epoch_num = self.last_epoch
+        tmp_decay_steps = self.decay_steps
+        if self.cycle:
+            div_res = math.ceil(
+                float(self.last_epoch) / float(self.decay_steps))
+
+            if self.last_epoch == 0:
+                div_res = 1
+            tmp_decay_steps = self.decay_steps * div_res
+        else:
+            tmp_epoch_num = min(self.last_epoch, self.decay_steps)
+
+        return (self.base_lr - self.end_lr) * (
+            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)
+             )**self.power) + self.end_lr
+
+
+class LinearLrWarmup(_LRScheduler):
+    """
+
+    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
+    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
+    
+    When epoch < warmup_steps, learning rate is updated as:
+    
+    .. code-block:: text
+    
+            lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps)
+    
+    where start_lr is the initial learning rate, and end_lr is the final learning rate;
+    
+    When epoch >= warmup_steps, learning rate is updated as:
+    
+    .. code-block:: text
+    
+            lr = learning_rate
+    
+    where lr is float or any subclass of ``_LRScheduler`` .
+
+    Args:
+        learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` .
+        warmup_steps (int): total steps of warm up.
+        start_lr (float): Initial learning rate of warm up.
+        end_lr (float): Final learning rate of warm up.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``LinearLrWarmup`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.LinearLrWarmup(
+                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup(
+                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()      
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 warmup_steps,
+                 start_lr,
+                 end_lr,
+                 last_epoch=-1,
+                 verbose=False):
+        type_check = isinstance(learning_rate, float) or isinstance(
+            learning_rate, int) or isinstance(learning_rate, _LRScheduler)
+        if not type_check:
+            raise TypeError(
+                "the type of learning_rate should be [int, float or _LRScheduler], the current type is {}".
+                format(learning_rate))
+        self.learning_rate = learning_rate
+        self.warmup_steps = warmup_steps
+        self.start_lr = start_lr
+        self.end_lr = end_lr
+        assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
+            end_lr, start_lr)
+        super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose)
+
+    def get_lr(self):
+        if self.last_epoch < self.warmup_steps:
+            return (self.end_lr - self.start_lr) * float(
+                self.last_epoch) / float(self.warmup_steps) + self.start_lr
+        else:
+            if isinstance(self.learning_rate, _LRScheduler):
+                self.learning_rate.step()
+                return self.learning_rate()
+
+            return self.learning_rate
+
+
+class ExponentialLR(_LRScheduler):
+    """
+
+    Update learning rate by 'gamma' each epoch.
+
+    The algorithm can be described as following.
+    
+    .. math::
+
+        new\_learning\_rate = last\_learning\_rate * gamma
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``ExponentialLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        self.gamma = gamma
+        super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr * (self.gamma**self.last_epoch)
+
+
+class MultiStepLR(_LRScheduler):
+    """
+    Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones.
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        milestones = [30, 50]
+        gamma = 0.1
+        if epoch < 30:
+            learning_rate = 0.5
+        elif epoch < 50:
+            learning_rate = 0.05
+        else:
+            learning_rate = 0.005
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+        
+
+    Returns:
+        ``MultiStepLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 milestones,
+                 gamma=0.1,
+                 last_epoch=-1,
+                 verbose=False):
+        if not isinstance(milestones, (tuple, list)):
+            raise TypeError(
+                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
+                % type(milestones))
+
+        if not all([
+                milestones[i] < milestones[i + 1]
+                for i in range(len(milestones) - 1)
+        ]):
+            raise ValueError('The elements of milestones must be incremented')
+        if gamma >= 1.0:
+            raise ValueError('gamma should be < 1.0.')
+
+        self.milestones = milestones
+        self.gamma = gamma
+        super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        for i in range(len(self.milestones)):
+            if self.last_epoch < self.milestones[i]:
+                return self.base_lr * (self.gamma**i)
+        return self.base_lr * (self.gamma**len(self.milestones))
+
+
+class StepLR(_LRScheduler):
+    """
+    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch.
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        step_size = 30
+        gamma = 0.1
+
+        learning_rate = 0.5     if epoch < 30
+        learning_rate = 0.05    if 30 <= epoch < 60
+        learning_rate = 0.005   if 60 <= epoch < 90
+        ...
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        step_size (int): the interval to update.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . 
+            It should be less than 1.0. Default: 0.1.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``StepLR`` instance to schedule learning rate.
+
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 step_size,
+                 gamma=0.1,
+                 last_epoch=-1,
+                 verbose=False):
+        if not isinstance(step_size, int):
+            raise TypeError(
+                "The type of 'step_size' must be 'int', but received %s." %
+                type(step_size))
+        if gamma >= 1.0:
+            raise ValueError('gamma should be < 1.0.')
+
+        self.step_size = step_size
+        self.gamma = gamma
+        super(StepLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        i = self.last_epoch // self.step_size
+        return self.base_lr * (self.gamma**i)
+
+
+class LambdaLR(_LRScheduler):
+    """
+    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` .
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5        # init learning_rate
+        lr_lambda = lambda epoch: 0.95 ** epoch
+
+        learning_rate = 0.5        # epoch 0
+        learning_rate = 0.475      # epoch 1
+        learning_rate = 0.45125    # epoch 2
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+    
+    Returns:
+        ``LambdaLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+
+    """
+
+    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
+        if not callable(lr_lambda):
+            raise TypeError(
+                "The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s."
+                % type(lr_lambda))
+
+        self.lr_lambda = lr_lambda
+        super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        return self.base_lr * self.lr_lambda(self.last_epoch)
+
+
+class ReduceLROnPlateau(_LRScheduler):
+    """
+    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate 
+    by 2 to 10 times once model performance has no longer improvement.
+
+    The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` 
+    stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . 
+    (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` 
+    number of epochs, the learning rate will be reduced.)
+
+    In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation.
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the 
+            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning 
+            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
+        factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . 
+            It should be less than 1.0. Default: 0.1.
+        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. 
+            Default: 10.
+        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . 
+            This make tiny changes of ``loss`` will be ignored. Default: 1e-4.
+        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
+            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum 
+            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
+        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
+        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
+        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, 
+            the update is ignored. Default: 1e-8.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.
+
+    
+    Returns:
+        ``ReduceLROnPlateau`` instance to schedule learning rate.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step(loss)
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step(out[0])
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 mode='min',
+                 factor=0.1,
+                 patience=10,
+                 threshold=1e-4,
+                 threshold_mode='rel',
+                 cooldown=0,
+                 min_lr=0,
+                 epsilon=1e-8,
+                 verbose=False):
+        mode = mode.lower()
+        if mode not in ['min', 'max']:
+            raise ValueError('mode: ' + mode + ' is unknown!')
+        self.mode = mode
+
+        if factor >= 1.0:
+            raise ValueError(
+                'new_lr = origin_lr * gamma and gamma should be < 1.0.')
+        self.factor = factor
+
+        threshold_mode = threshold_mode.lower()
+        if threshold_mode not in ['rel', 'abs']:
+            raise ValueError('threshold mode: ' + threshold_mode +
+                             ' is unknown!')
+        self.threshold_mode = threshold_mode
+        if not isinstance(learning_rate, (float, int)):
+            raise TypeError(
+                "The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s."
+                % type(learning_rate))
+
+        self.verbose = verbose
+        self.patience = patience
+        self.threshold = threshold
+        self.threshold_mode = threshold_mode
+        self.cooldown = cooldown
+        self.min_lr = min_lr
+        self.epsilon = epsilon
+
+        self.cooldown_counter = 0
+        self.best = None
+        self.num_bad_epochs = 0
+
+        # Can not call Parent __init__, so implement here.
+        self.base_lr = float(learning_rate)
+        self.last_lr = float(learning_rate)
+        self.last_epoch = 0
+        self.verbose = verbose
+        self._var_name = None
+
+    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
+    def _state_keys(self):
+        self.keys = [
+            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
+            'last_lr'
+        ]
+
+    def step(self, metrics, epoch=None):
+        """
+        step should be called after 'minimize' . It will update the learning rate in optimizer according to ``metrics`` .  
+        The new learning rate will take effect on next epoch.
+
+        Args:
+            metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce. 
+                If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. If it's 'Tensor' or
+                'numpy.ndarray', its shape must be [1].
+            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+
+        Returns:
+            None
+        
+        Examples:
+            Please refer to the example of current _LRScheduler.
+        """
+        if epoch is None:
+            self.last_epoch = self.last_epoch + 1
+        else:
+            self.last_epoch = epoch
+
+        # loss must be 1-D Tensor with shape [1]
+        if isinstance(metrics, (Tensor, numpy.ndarray)):
+            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
+                "should be (1L,), but the current metrics.shape is {}. Maybe that "  \
+                "you should call paddle.mean to process it first.".format(loss.shape)
+        elif not isinstance(metrics,
+                            (int, float, numpy.float32, numpy.float64)):
+            raise TypeError(
+                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".
+                format(type(metrics)))
+
+        if self.cooldown_counter > 0:
+            self.cooldown_counter -= 1
+        else:
+            if self.best is None or self._is_better(metrics, self.best):
+                self.best = metrics
+                self.num_bad_epochs = 0
+            else:
+                self.num_bad_epochs += 1
+
+            if self.num_bad_epochs > self.patience:
+                self.cooldown_counter = self.cooldown
+                self.num_bad_epochs = 0
+                new_lr = max(self.last_lr * self.factor, self.min_lr)
+                if self.last_lr - new_lr > self.epsilon:
+                    self.last_lr = new_lr
+                    if self.verbose:
+                        print('Epoch {}: {} set learning rate to {}.'.format(
+                            self.last_epoch, self.__class__.__name__,
+                            self.last_lr))
+
+    def _is_better(self, current, best):
+        print("mode", self.mode, 'threshold_mode', self.threshold_mode)
+        if self.mode == 'min' and self.threshold_mode == 'rel':
+            return current < best - best * self.threshold
+
+        elif self.mode == 'min' and self.threshold_mode == 'abs':
+            return current < best - self.threshold
+
+        elif self.mode == 'max' and self.threshold_mode == 'rel':
+            return current > best + best * self.threshold
+
+        else:
+            return current > best + self.threshold
+
+
+class CosineAnnealingLR(_LRScheduler):
+    """
+
+    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to 
+    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in 
+    SGDR:
+
+        \begin{aligned}
+            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+            & T_{cur} \neq (2k+1)T_{max}; \\
+            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+            & T_{cur} = (2k+1)T_{max}.
+        \end{aligned}
+
+    The algorithm can be described as following.
+
+    .. math::
+        \begin{aligned}
+            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+            & T_{cur} \neq (2k+1)T_{max}; \\
+            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+            & T_{cur} = (2k+1)T_{max}.
+        \end{aligned}
+    
+    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_. 
+    Note that this only implements the cosine annealing part of SGDR, and not the restarts.
+    
+    Args:
+        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number.
+        T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate.
+        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
+        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
+
+    Returns:
+        ``CosineAnnealingLR`` instance to schedule learning rate.
+
+    Examples:
+        
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            # train on default dygraph mode
+            paddle.disable_static()
+            x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True)
+            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(20):
+                for batch_id in range(2):
+                    x = paddle.to_tensor(x)
+                    out = linear(x)
+                    loss = paddle.reduce_mean(out)
+                    loss.backward()
+                    sgd.minimize(loss)
+                    linear.clear_gradients()
+                scheduler.step()
+
+            # train on static mode
+            paddle.enable_static()
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.static.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[None, 4, 5])
+                y = paddle.static.data(name='y', shape=[None, 4, 5])
+                z = paddle.static.nn.fc(x, 100)
+                loss = paddle.mean(z)
+                scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True)
+                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
+                sgd.minimize(loss)
+
+            exe = paddle.static.Executor()
+            exe.run(start_prog)
+            for epoch in range(20):
+                for batch_id in range(2):
+                    out = exe.run(
+                        main_prog,
+                        feed={
+                            'x': np.random.randn(3, 4, 5).astype('float32'),
+                            'y': np.random.randn(3, 4, 5).astype('float32')
+                        },
+                        fetch_list=loss.name)
+                scheduler.step()
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 T_max,
+                 eta_min=0,
+                 last_epoch=-1,
+                 verbose=False):
+        if not isinstance(T_max, int):
+            raise TypeError(
+                "The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s."
+                % type(T_max))
+        if not isinstance(eta_min, (float, int)):
+            raise TypeError(
+                "The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s."
+                % type(eta_min))
+        self.T_max = T_max
+        self.eta_min = float(eta_min)
+        super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch,
+                                                verbose)
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr
+        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
+            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
+                math.pi / self.T_max)) / 2
+
+        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
+            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
+                self.last_lr - self.eta_min) + self.eta_min
+
+    def _get_closed_form_lr(self):
+        return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(
+            math.pi * self.last_epoch / self.T_max)) / 2
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb602ff0b3754f82fb2ef9d8b78de18e46d56778
--- /dev/null
+++ b/python/paddle/optimizer/optimizer.py
@@ -0,0 +1,998 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import six
+import logging
+from collections import defaultdict
+
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
+from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
+import paddle
+
+from ..fluid import framework
+from ..fluid import layers
+from ..fluid import unique_name
+from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
+from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
+from ..fluid.framework import program_guard
+from ..fluid.initializer import Constant
+from ..fluid.layer_helper import LayerHelper
+from ..fluid.layers import ops
+from ..fluid.regularizer import append_regularization_ops
+from ..fluid.dygraph import base as imperative_base
+from ..fluid.dygraph import no_grad
+from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
+from paddle.fluid import core
+from paddle.fluid.layers import tensor
+from functools import reduce
+from ..fluid.wrapped_decorator import signature_safe_contextmanager
+from .. import compat as cpt
+
+__all__ = ['Optimizer']
+
+
+class Optimizer(object):
+    """Optimizer Base class.
+
+    Define the common interface of an optimizer.
+    User should not use this class directly,
+    but need to use one of it's implementation.
+
+    Args:
+        learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay.
+        parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It canbe a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \
+            some derived class of ``GradientClipBase`` . There are three cliping strategies \
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
+
+    Returns:
+       Base class for optimizer. 
+    
+    Examples:
+        .. code-block:: python
+
+            #Take the subclass adam as an example
+            #Optimizer 
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.Adam(learning_rate=0.1,
+                    parameters=linear.parameters())
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+
+    @imperative_base.no_grad()
+    def __init__(self,
+                 learning_rate,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        self._parameter_list = list(
+            parameters) if parameters is not None else None
+        self._name = name
+        if framework.in_dygraph_mode():
+            if not isinstance(learning_rate, float) and \
+                    not isinstance(learning_rate, LearningRateDecay):
+                raise TypeError(
+                    "learning rate should be float or LearningRateDecay, got %s here"
+                    % type(learning_rate))
+            if self._parameter_list is None:
+                raise AttributeError(
+                    "parameters argument given to the Optimizer should not be None in dygraph mode."
+                )
+            if weight_decay is not None:
+                for param in self._parameter_list:
+                    if param.regularizer is not None:
+                        logging.info(
+                            "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
+                            "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
+                            % weight_decay.__str__())
+                        break
+        else:
+            if not isinstance(learning_rate, float) and \
+                    not isinstance(learning_rate, framework.Variable):
+                raise TypeError(
+                    "learning rate should be float or Tensor, got %s here" %
+                    type(learning_rate))
+
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
+        if isinstance(weight_decay, float):
+            from ..fluid.regularizer import L2Decay
+            self.regularization = L2Decay(weight_decay)
+        else:
+            self.regularization = weight_decay
+        self._grad_clip = grad_clip
+        self._learning_rate = learning_rate
+        # the learning rate type should be inferenced from loss
+        self._dtype = None
+        # each program should have a independent learning rate
+        # program -> tensor(learning_rate)
+        self._learning_rate_map = dict()
+        if isinstance(self._learning_rate, framework.Variable):
+            self._learning_rate_map[framework.default_main_program(
+            )] = self._learning_rate
+        # Dictionary of accumulators. Some optimizer subclasses need to
+        # allocate and manage extra tensors associated with the parameters
+        # to train. These tensors are called accumulators.
+        # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(lambda: dict())
+        self.helper = None
+        self._opti_name_list = []
+        self._accumulators_holder = {}
+        self._param_device_map = dict()
+        self.clear_gradients = self.clear_grad
+
+    @framework.dygraph_only
+    def state_dict(self):
+        '''
+        Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict.
+        If the optimizer never be called(minimize function), the state_dict is empty.
+
+        Args: 
+            None
+
+        Returns:
+            state_dict(dict) : dict contains all the Tensor used by optimizer
+        
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+
+                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+                state_dict = adam.state_dict()
+
+        '''
+        state_dict = {}
+        for k, v in self._accumulators.items():
+            for para_name, var_tmp in v.items():
+                state_dict[var_tmp.name] = var_tmp
+        # global step if use lr decay
+        if isinstance(self._learning_rate, LearningRateDecay):
+            state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
+
+            if not isinstance(self._learning_rate, _LearningRateEpochDecay):
+                var_tmp = None
+                var_temp = framework._varbase_creator(
+                    None, name='global_step', dtype='int32')
+
+                tensor.fill_constant(
+                    [1], "int32", self._learning_rate.step_num, out=var_temp)
+
+                state_dict['global_step'] = var_temp
+        return state_dict
+
+    @framework.dygraph_only
+    def set_state_dict(self, state_dict):
+        '''
+        Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed.
+
+        Args: 
+            state_dict(dict) : Dict contains all the Tensor needed by optimizer
+        Return:
+            None
+        
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+
+                state_dict = emb.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+
+                adam = paddle.optimizer.Adam(learning_rate=paddle.nn.functional.noam_decay( 100, 10000), 
+                                            parameters=emb.parameters())
+                state_dict = adam.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+
+                para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
+
+                adam.set_state_dict(opti_state_dict)
+
+        '''
+
+        if isinstance(self._learning_rate, LearningRateDecay):
+            self._learning_rate.set_dict(state_dict["LR_Scheduler"])
+
+            if not isinstance(self._learning_rate, _LearningRateEpochDecay):
+                assert 'global_step' in state_dict, \
+                        'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict'
+                global_step = state_dict['global_step']
+
+                if isinstance(global_step, Variable):
+                    step_np = global_step
+                    step_np = np.array(step_np.value().get_tensor())
+                    assert step_np.shape == (1,),  \
+                            "global step shape is (1,), the shape is {}".format( step_np.shape )
+
+                    self._learning_rate.step_num = int(step_np[0])
+                elif isinstance(global_step, np.ndarray):
+                    assert global_step.shape == (1,),  \
+                            "global step shape is (1,), the shape is {}".format( global_step.shape )
+                    self._learning_rate.step_num = global_step[0]
+                else:
+                    raise RuntimeError(
+                        "Type not supprt, value in state dict must be [VarBase, Tensor, numpy], the type is ",
+                        type(global_step))
+
+        self._accumulators_holder = state_dict
+        for k, v in self._accumulators.items():
+            for para_name, var_tmp in v.items():
+                assert var_tmp.name in state_dict, \
+                        "optimizer Tensor {} not found".format( var_tmp.name )
+                var = var_tmp.value()
+                tensor = var.get_tensor()
+                model_np = np.array(tensor)
+
+                load_para = state_dict[var_tmp.name]
+
+                if isinstance(load_para, Variable):
+                    load_para_np = load_para.numpy()
+                elif isinstance(load_para, core.VarBase):
+                    load_para_np = load_para.numpy()
+                elif isinstance(load_para, np.ndarray):
+                    load_para_np = load_para
+                else:
+                    raise RuntimeError("State dict type {} not supprt".format(
+                        str(type(load_para))))
+
+                assert model_np.shape == load_para_np.shape,  \
+                                          "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
+                                                 item.name, model_np.shape, load_para_np.shape)
+
+                assert model_np.dtype == load_para_np.dtype, \
+                                          "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {}  but load tensor with dtype {}".format(
+                                                item.name, model_np.dtype, load_para_np.dtype)
+
+                tensor.set(load_para_np, framework._current_expected_place())
+
+    def get_opti_var_name_list(self):
+        return self._opti_name_list
+
+    def _create_global_learning_rate(self):
+        if imperative_base.enabled():
+            # create learning rate tensor
+            if isinstance(self._learning_rate, float):
+                lr = self._global_learning_rate()
+
+                if isinstance(lr, framework.Variable):
+                    return
+                else:
+                    self._learning_rate_map[framework.default_main_program(
+                    )] = layers.create_global_var(
+                        name=unique_name.generate("learning_rate"),
+                        shape=[1],
+                        value=float(self._learning_rate),
+                        dtype=paddle.get_default_dtype()
+                        if self._dtype is None else self._dtype,
+                        persistable=True)
+            # get learning rate Tensor from LearningRateDecay
+            elif isinstance(self._learning_rate, LearningRateDecay):
+                self._learning_rate_map[framework.default_main_program(
+                )] = self._learning_rate()
+            else:
+                raise TypeError(
+                    "optimizer's learning rate must be float or LearningRateDecay"
+                )
+        else:
+            lr = self._global_learning_rate()
+
+            if isinstance(lr, framework.Variable):
+                return
+            else:
+                if not isinstance(self._learning_rate, float):
+                    raise TypeError(
+                        "learning rate Tensor is create outside optimizer,"
+                        "can not create new learning rate Tensor for new program"
+                    )
+
+            # create learning rate in the current main program
+            self._learning_rate_map[framework.default_main_program(
+            )] = layers.create_global_var(
+                name=unique_name.generate("learning_rate"),
+                shape=[1],
+                value=float(self._learning_rate),
+                dtype=paddle.get_default_dtype()
+                if self._dtype is None else self._dtype,
+                persistable=True)
+
+    @framework.dygraph_only
+    def set_lr(self, value):
+        """
+        :api_attr: imperative
+        
+        Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay,
+        this API cannot be invoked, because it will lead to conflict.
+
+        Args:
+            value (float|Tensor): the value of learning rate
+
+        Returns:
+            None
+          
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                paddle.disable_static()
+                linear = paddle.nn.Linear(10, 10)
+
+                adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
+
+                # set learning rate manually by python float value
+                lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+                for i in range(5):
+                    adam.set_lr(lr_list[i])
+                    lr = adam.get_lr()
+                    print("current lr is {}".format(lr))
+                # Print:
+                #    current lr is 0.2
+                #    current lr is 0.3
+                #    current lr is 0.4
+                #    current lr is 0.5
+                #    current lr is 0.6
+
+
+                    # set learning rate manually by framework Tensor
+                    lr_var = paddle.create_global_var(
+                        shape=[1], value=0.7, dtype='float32')
+                    adam.set_lr(lr_var)
+                    lr = adam.get_lr()
+                    print("current lr is {}".format(lr))
+                    # Print:
+                    #    current lr is 0.7
+
+
+
+        """
+        if not isinstance(value, (framework.Variable, float)):
+            raise TypeError(
+                "The type of 'value' in optimizer.set_lr must be (float, Tensor), but received %s."
+                % (type(value)))
+        if isinstance(self._learning_rate, LearningRateDecay):
+            raise RuntimeError(
+                "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict."
+            )
+        if isinstance(value, float):
+            self._learning_rate = value
+            current_lr = self._global_learning_rate()
+            if current_lr is not None:
+                global_block = framework.default_main_program().global_block()
+                global_block.append_op(
+                    type='fill_constant',
+                    outputs={'Out': [current_lr]},
+                    attrs={
+                        'dtype': current_lr.dtype,
+                        'shape': list(current_lr.shape),
+                        'value': float(value)
+                    },
+                    stop_gradient=True)
+        else:
+            assert len(value.shape) == 1 and value.shape[
+                0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]"
+            self._learning_rate_map[framework.default_main_program()] = value
+
+    @framework.dygraph_only
+    def get_lr(self):
+        """
+        :api_attr: imperative
+        
+        Get current step learning rate. The return value is all the same When LearningRateDecay is not used,
+        otherwise return the step learning rate.
+
+        Returns:
+            float: The learning rate of the current step.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                # example1: LearningRateDecay is not used, return value is all the same
+                paddle.disable_static()
+                emb = paddle.nn.Embedding([10, 10])
+                adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
+                lr = adam.get_lr()
+                print(lr) # 0.001
+
+                # example2: PiecewiseDecay is used, return the step learning rate
+                paddle.disable_static()
+                inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+                linear = paddle.nn.Linear(10, 10)
+                inp = paddle.to_tensor(inp)
+                out = linear(inp)
+                loss = paddle.reduce_mean(out)
+                
+                bd = [2, 4, 6, 8]
+                value = [0.2, 0.4, 0.6, 0.8, 1.0]
+                adam = paddle.optimizer.Adam(paddle.PiecewiseDecay(bd, value, 0),
+                                       parameters=linear.parameters())
+
+                # first step: learning rate is 0.2
+                np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True
+
+                # learning rate for different steps
+                ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
+                for i in range(12):
+                    adam.step()
+                    lr = adam.get_lr()
+                    np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True
+
+        """
+        current_lr = self._global_learning_rate()
+        if isinstance(current_lr, framework.Variable):
+            return self._global_learning_rate().numpy()[0]
+
+        if isinstance(self._learning_rate, float):
+            return self._learning_rate
+        elif isinstance(self._learning_rate, _LearningRateEpochDecay):
+            step_lr = self._learning_rate()
+            return step_lr.numpy()[0]
+        else:
+            step_lr = self._learning_rate.step()
+            if isinstance(step_lr, (float, int)):
+                return step_lr
+            else:
+                return step_lr.numpy()[0]
+
+    def _global_learning_rate(self, program=None):
+        """
+        get global decayed learning rate
+        :return:
+        """
+        if program is None:
+            program = framework.default_main_program()
+        return self._learning_rate_map.get(program, None)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        """ append optimize operator to block and return all the added optimize_op
+        """
+        raise NotImplementedError(
+            "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\""
+        )
+
+    def _create_param_lr(self, param_and_grad):
+        # create learning rate tensor for every parameter
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']
+        if type(param_lr) == Variable:
+            return param_lr
+        else:
+            if param_lr == 1.0:
+                return self._global_learning_rate()
+            else:
+                with default_main_program()._lr_schedule_guard(
+                        is_with_opt=True), framework.name_scope(
+                            'scale_with_param_lr'):
+                    return self._global_learning_rate() * param_lr
+
+    def _create_accumulators(self, block, parameters):
+        """Create all accumulators needed by the parameters
+
+        Args:
+            block: the block in which the loss tensor is present
+            parameters: list of parameter tensors for the optimizer
+        """
+        pass
+
+    def _finish_update(self, block, parameters_and_grads):
+        """Finish any custom updates needed
+           before completing an optimization step
+
+        Args:
+            block: the block in which the loss tensor is present
+            parameters: list of parameter tensors for the optimizer
+
+        Returns:
+            None
+        """
+        pass
+
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None,
+                         type=None,
+                         device=None):
+        """Utility function to add an accumulator for a parameter
+
+        Args:
+            block: the block in which the loss tensor is present
+            name: name of the accumulator
+            param: parameter tensor for which accumulator is to be added
+            dtype: data type of the accumulator tensor
+            fill_value: value to initialize the accumulator tensor
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        if (name in self._accumulators and
+                param.name in self._accumulators[name]):
+            if framework.in_dygraph_mode():
+                return self._accumulators[name][param.name]
+            raise Exception("Accumulator {} already exists for parameter {}".
+                            format(name, param.name))
+        if shape == None:
+            shape = param.shape
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + "_" + name
+        var_name = unique_name.generate(var_name)
+        self._opti_name_list.append(var_name)
+
+        var = self.helper.create_global_variable(
+            name=var_name,
+            persistable=True,
+            dtype=dtype or param.dtype,
+            type=param.type if type is None else type,
+            shape=shape,
+            belong_to_optimizer=True)
+        if device is None:
+            device = self._get_device_for_param(param.name)
+        with device_guard(device):
+            self.helper.set_variable_initializer(
+                var, initializer=Constant(value=float(fill_value)))
+
+        if framework.in_dygraph_mode():
+            if len(self._accumulators_holder) > 0:
+                assert var_name in self._accumulators_holder, \
+                        "Optimizer set error, {} should in state dict".format( var_name )
+                var.set_value(self._accumulators_holder[var_name])
+
+        self._accumulators[name][param.name] = var
+        return var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter tensor for which accumulator is to be fetched
+
+        Returns:
+            accumulator tensor for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        if (name not in self._accumulators or
+                param.name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, param.name))
+        return self._accumulators[name][param.name]
+
+    def _update_param_device_map(self, parameters_and_grads, target_block):
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[0].trainable is True:
+                param_name = param_and_grad[0].name
+                ops = target_block.ops
+                device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
+                )
+                for op in ops:
+                    input_arg_names = op.input_arg_names
+                    if param_name in input_arg_names:
+                        self._param_device_map[param_name] = op.attr(
+                            device_attr_name)
+                        break
+
+    def _get_device_for_param(self, param_name):
+        device = None
+        if param_name in self._param_device_map:
+            device = self._param_device_map[param_name]
+        return device
+
+    def _create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to tensors.
+
+        Args:
+          parameters_and_grads(list(tuple(Tensor, Tensor))):
+            a list of (tensor, gradient) pair to update.
+
+        Returns:
+          return_op_list: a list of operators that will complete one step of
+            optimization. This will include parameter update ops, global step
+            update ops and any other custom ops required by subclasses to manage
+            their internal state.
+        """
+        # This is a default implementation of create_optimization_pass that
+        # can be shared by most optimizers. This implementation assumes that
+        # the subclass will implement the _append_optimize_op method and the
+        #  _initialize_tensors method. The subclass can extend the
+        # _create_accumulators method if it needs to create accumulators
+        # for parameters and extend _finish_update method to add custom ops.
+
+        # Allways called under program_guard use global block as loss block
+        # But if current block is in control flow, append optimize op in the
+        # grad block of current block
+
+        global_block = framework.default_main_program().global_block()
+        target_block = global_block
+        current_block = framework.default_main_program().current_block()
+        if current_block.idx != global_block.idx:
+            assert current_block.backward_block_idx != -1, \
+                "current block is not global_block, but it doesn't have backward block."
+            target_block = framework.default_main_program().blocks[
+                current_block.backward_block_idx]
+
+        start = len(target_block.ops)
+        self.helper = LayerHelper(self.__class__.__name__)
+        self._update_param_device_map(parameters_and_grads, target_block)
+        self._create_accumulators(
+            target_block,
+            [p[0] for p in parameters_and_grads if p[0].trainable])
+        self._create_global_learning_rate()
+
+        if framework.in_dygraph_mode():
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                if param_and_grad[0].trainable is True:
+                    self._append_optimize_op(target_block, param_and_grad)
+        else:
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                with param_and_grad[0].block.program._optimized_guard(
+                        param_and_grad), name_scope("optimizer"):
+                    if param_and_grad[0].trainable is True:
+                        device = self._get_device_for_param(param_and_grad[0]
+                                                            .name)
+                        with device_guard(device):
+                            optimize_op = self._append_optimize_op(
+                                target_block, param_and_grad)
+
+        # Get custom finish ops for subclasses
+        # FIXME: Need to fix this once we figure out how to handle dependencies
+        self._finish_update(target_block, parameters_and_grads)
+
+        end = len(target_block.ops)
+        return target_block._slice_ops(start, end)
+
+    def _append_dgc_ops(self, param_and_grad):
+        pass
+
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameters=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        """
+        The first part of ``minimize``, do auto-diff to append backward operations for
+        the current program.
+
+        Args:
+            loss (Tensor): ``loss`` tensor to run optimizations.
+            startup_program (Program, optional): :ref:`api_fluid_Program` for
+                initializing parameters in ``parameters``. The default value
+                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
+            parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
+                to minimize ``loss``. The default value is None, at this time all parameters
+                will be updated.
+            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
+                to be updated. The default value is None.
+            callbacks (list, optional): list of callable objects to run when appending backward
+                operator for one parameter. The default value is None.
+
+        Return:
+            list: list of (param, grad) tensor pairs, param is ``Parameter``,
+                grad is the gradient value corresponding to the parameter.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5, dtype="float32")
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+        """
+        act_no_grad_set = None
+        if framework.in_dygraph_mode():
+            pass
+        else:
+            act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
+
+        self._dtype = loss.dtype
+        if framework.in_dygraph_mode():
+            params_grads = []
+            for param in self._parameter_list:
+                if not param.trainable:
+                    continue
+                if param._grad_ivar() is not None:
+                    # create gradient tensor
+                    grad_var = param._grad_ivar()
+                    params_grads.append((param, grad_var))
+        else:
+            if callbacks is None:
+                callbacks = [error_clip_callback]
+            else:
+                assert (isinstance(callbacks, list))
+            program = loss.block.program
+            assert len(loss.shape) == 1 and loss.shape[0] == 1, \
+                "The loss.shape should be (1L,), but the current loss.shape is {}. " \
+                "Maybe that you should call paddle.mean to process the current loss.".format(
+                    loss.shape)
+            parameter_list = parameters if parameters \
+                else self._parameter_list
+            with program_guard(program, startup_program):
+                params_grads = append_backward(loss, parameter_list,
+                                               act_no_grad_set, callbacks)
+                # Note: since we can't use all_reduce_op now,
+                #  dgc_op should be the last op of one grad.
+                self._append_dgc_ops(params_grads)
+        return params_grads
+
+    def apply_gradients(self, params_grads):
+        """
+        Second part of `minimize`, appending optimization operators for
+        given `params_grads` pairs.
+
+        Args:
+            params_grads (list): list of (param, grad) pair to do optimization.
+
+        Returns:
+            list: A list of operators appended to the current program.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+
+                paddle.disable_static()
+                inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+                linear = paddle.nn.Linear(10, 10)
+                inp = paddle.to_tensor(inp)
+                out = linear(inp)
+                loss = paddle.mean(out)
+                optimizer = paddle.optimizer.Adam(learning_rate=0.1,
+                        parameters=linear.parameters())
+                params_grads = optimizer.backward(loss)
+                optimizer.apply_gradients(params_grads)
+
+        """
+
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+
+        # 'optimizer(grad_clip)' or 'set_gradient_clip'
+        if self._grad_clip is not None:
+            params_grads = self._grad_clip(params_grads)
+        else:
+
+            params_grads = append_gradient_clip_ops(params_grads)
+
+        # Add regularization if any
+        params_grads = append_regularization_ops(params_grads,
+                                                 self.regularization)
+
+        optimize_ops = self._create_optimization_pass(params_grads)
+        return optimize_ops
+
+    def _apply_optimize(self, loss, startup_program, params_grads):
+        """
+        Second part of `minimize`, appending optimization operators for
+        given `params_grads` pairs.
+        Args:
+            loss (Tensor): loss tensor to run optimizations.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameters`.
+            params_grads (list): list of (param, grad) pair to do optimization.
+        Returns:
+            list: A list of operators appended to the current program.
+        """
+        if framework.in_dygraph_mode():
+            with program_guard(framework.default_main_program(),
+                               framework.default_startup_program()):
+                if self._grad_clip is not None:
+                    params_grads = self._grad_clip(params_grads)
+                params_grads = append_regularization_ops(params_grads,
+                                                         self.regularization)
+                optimize_ops = self._create_optimization_pass(params_grads)
+        else:
+            program = loss.block.program
+            with program_guard(program, startup_program):
+                optimize_ops = self.apply_gradients(params_grads)
+        return optimize_ops
+
+    def _get_no_grad_set(self, loss, no_grad_set=None):
+        no_grad_set = _get_no_grad_set_name(no_grad_set)
+        parameters = loss.block.program.global_block().all_parameters()
+        param_no_trainable = set(
+            [param.name for param in parameters if param.trainable is False])
+        # If the parameter is no trainable, it should not have a gradient.
+        no_grad_set.update(param_no_trainable)
+
+        return no_grad_set
+
+    @framework.dygraph_only
+    def clear_grad(self):
+        """
+        Clear the gradients of all optimized parameters for model.
+        
+        Returns:
+            None
+        
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5, dtype="float32")
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+
+        """
+        for p in self._parameter_list:
+            if p.trainable:
+                p.clear_gradient()
+
+    @imperative_base.no_grad()
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameters=None,
+                 no_grad_set=None):
+        """
+        Add operations to minimize ``loss`` by updating ``parameters``.
+
+        Args:
+            loss (Tensor): A ``Tensor`` containing the value to minimize.
+            startup_program (Program, optional): :ref:`api_fluid_Program` for
+                initializing parameters in ``parameters``. The default value
+                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
+            parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
+                to minimize ``loss``. The default value is None, at this time all parameters
+                will be updated.
+            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
+                to be updated. The default value is None.
+
+        Returns:
+            tuple: tuple (optimize_ops, params_grads), A list of operators appended
+            by minimize and a list of (param, grad) tensor pairs, param is
+            ``Parameter``, grad is the gradient value corresponding to the parameter.
+            The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to 
+            indicate program pruning. If so, the program will be pruned by ``feed`` and 
+            ``fetch_list`` before run, see details in ``Executor``.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import paddle.fluid as fluid
+
+                place = fluid.CPUPlace()
+                main = fluid.Program()
+                with fluid.program_guard(main):
+                    x = fluid.data(name='x', shape=[None, 13], dtype='float32')
+                    y = fluid.data(name='y', shape=[None, 1], dtype='float32')
+                    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+                    avg_cost = fluid.layers.mean(cost)
+
+                    adam_optimizer = paddle.optimizer.Adam(0.01)
+                    adam_optimizer.minimize(avg_cost)
+
+                    fetch_list = [avg_cost]
+                    train_reader = paddle.batch(
+                        paddle.dataset.uci_housing.train(), batch_size=1)
+                    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+                    exe = fluid.Executor(place)
+                    exe.run(fluid.default_startup_program())
+                    for data in train_reader():
+                        exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+        """
+        assert isinstance(loss, Variable), "The loss should be an Tensor."
+
+        parameter_list = parameters if parameters \
+            else self._parameter_list
+        params_grads = self.backward(
+            loss,
+            startup_program=startup_program,
+            parameters=parameter_list,
+            no_grad_set=no_grad_set)
+
+        optimize_ops = self._apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
+        return optimize_ops, params_grads
+
+    @framework.dygraph_only
+    def step(self):
+        """
+        Execute the optimizer once.
+        
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5, dtype="float32")
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+        """
+        parameter_list = self._parameter_list
+        self._dtype = None
+        params_grads = []
+        for param in self._parameter_list:
+            if not param.trainable:
+                continue
+            if param._grad_ivar() is not None:
+                grad_var = param._grad_ivar()
+                params_grads.append((param, grad_var))
+
+        optimize_ops = self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads)
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc4c9bfd53dc15449f03d6de6c8942e977bf562
--- /dev/null
+++ b/python/paddle/optimizer/rmsprop.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["RMSProp"]
+
+
+class RMSProp(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
+
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    if centered is True:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
+    and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in range
+    from 1e-4 to 1e-8.
+
+
+    Parameters:
+        learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay.
+        rho(float): rho is :math: `\\rho` in equation, default is 0.95.
+        epsilon(float): :math: `\\epsilon` in equation is smoothing term to
+            avoid division by zero, default is 1e-6.
+        momentum(float): :math:`\\beta` in equation is the momentum term,
+            default is 0.0.
+        centered(bool): If True, gradients are normalized by the estimated variance of
+            the gradient; if False, by the uncentered second moment. Setting this to
+            True may help with training, but is slightly more expensive in terms of
+            computation and memory. Defaults to False.
+	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+	    This parameter is required in dygraph mode. \
+	    The default value is None in static mode, at this time all parameters will be updated.
+	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+	    It canbe a float value as coeff of L2 regularization or \
+	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+	    the regularization setting here in optimizer will be ignored for this parameter. \
+	    Otherwise, the regularization setting here in optimizer will take effect. \
+	    Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
+            some derived class of ``GradientClipBase`` . There are three cliping strategies 
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): This parameter is used by developers to print debugging information. \
+            For details, please refer to :ref:`api_guide_Name`. Default is None.
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+          .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+
+            adam = paddle.optimizer.RMSProp(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    weight_decay=0.01)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+    _mean_grad_acc_str = "mean_grad"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 centered=False,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        super(RMSProp, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+        self._centered = centered
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+            self._add_accumulator(self._mean_grad_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
+                                              param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "MeanGrad": mean_grad_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc,
+                "MeanGradOut": mean_grad_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum,
+                "centered": self._centered
+            },
+            stop_gradient=True)
+
+        return rmsprop_op
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..42a28a4f04e368cf8a1c1a144639bc743234a540
--- /dev/null
+++ b/python/paddle/static/__init__.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO: import framework api under this directory 
+__all__ = [
+    'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard',
+    'BuildStrategy', 'CompiledProgram', 'Print', 'py_func', 'ExecutionStrategy',
+    'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr',
+    'default_main_program', 'default_startup_program', 'Program', 'save',
+    'load', 'data', 'InputSpec'
+]
+
+from . import nn
+from .input import data  #DEFINE_ALIAS
+from .input import InputSpec  #DEFINE_ALIAS
+from ..fluid.executor import Executor  #DEFINE_ALIAS
+from ..fluid.executor import global_scope  #DEFINE_ALIAS
+from ..fluid.executor import scope_guard  #DEFINE_ALIAS
+from ..fluid.backward import append_backward  #DEFINE_ALIAS
+from ..fluid.backward import gradients  #DEFINE_ALIAS
+from ..fluid.compiler import BuildStrategy  #DEFINE_ALIAS
+from ..fluid.compiler import CompiledProgram  #DEFINE_ALIAS
+from ..fluid.compiler import ExecutionStrategy  #DEFINE_ALIAS
+from ..fluid.framework import default_main_program  #DEFINE_ALIAS
+from ..fluid.framework import default_startup_program  #DEFINE_ALIAS
+from ..fluid.framework import Program  #DEFINE_ALIAS
+from ..fluid.framework import name_scope  #DEFINE_ALIAS
+from ..fluid.framework import program_guard  #DEFINE_ALIAS
+from ..fluid.layers.control_flow import Print  #DEFINE_ALIAS
+from ..fluid.layers.nn import py_func  #DEFINE_ALIAS
+from ..fluid.parallel_executor import ParallelExecutor  #DEFINE_ALIAS
+from ..fluid.param_attr import WeightNormParamAttr  #DEFINE_ALIAS
+from ..tensor.io import save  #DEFINE_ALIAS
+from ..tensor.io import load  #DEFINE_ALIAS
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
new file mode 100644
index 0000000000000000000000000000000000000000..06b9c7cdbef5dd11d237a2b85586e598611bf83e
--- /dev/null
+++ b/python/paddle/static/input.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+import six
+
+from paddle.fluid import core
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.data_feeder import check_dtype, check_type
+
+__all__ = ['data', 'InputSpec']
+
+
+def data(name, shape, dtype=None, lod_level=0):
+    """
+    **Data Layer**
+
+    This function creates a variable on the global block. The global variable
+    can be accessed by all the following operators in the graph. The variable
+    is a placeholder that could be fed with input, such as Executor can feed
+    input into the variable. When `dtype` is None, the dtype
+    will get from the global dtype by `paddle.get_default_dtype()`.
+
+    Args:
+       name (str): The name/alias of the variable, see :ref:`api_guide_Name`
+           for more details.
+       shape (list|tuple): List|Tuple of integers declaring the shape. You can
+           set "None" or -1 at a dimension to indicate the dimension can be of any
+           size. For example, it is useful to set changeable batch size as "None" or -1.
+       dtype (np.dtype|str, optional): The type of the data. Supported
+           dtype: bool, float16, float32, float64, int8, int16, int32, int64,
+           uint8. Default: None. When `dtype` is not set, the dtype will get 
+           from the global dtype by `paddle.get_default_dtype()`.
+       lod_level (int, optional): The LoD level of the LoDTensor. Usually users
+           don't have to set this value. For more details about when and how to
+           use LoD level, see :ref:`user_guide_lod_tensor` . Default: 0.
+
+    Returns:
+        Variable: The global variable that gives access to the data.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid as fluid
+          import paddle
+
+          # Creates a variable with fixed size [3, 2, 1]
+          # User can only feed data of the same shape to x
+          # the dtype is not set, so it will set "float32" by
+          # paddle.get_default_dtype(). You can use paddle.get_default_dtype() to 
+          # change the global dtype
+          x = paddle.static.data(name='x', shape=[3, 2, 1])
+
+          # Creates a variable with changeable batch size -1.
+          # Users can feed data of any batch size into y,
+          # but size of each data sample has to be [2, 1]
+          y = paddle.static.data(name='y', shape=[-1, 2, 1], dtype='float32')
+
+          z = x + y
+
+          # In this example, we will feed x and y with np-ndarray "1"
+          # and fetch z, like implementing "1 + 1 = 2" in PaddlePaddle
+          feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)
+
+          exe = fluid.Executor(fluid.CPUPlace())
+          out = exe.run(fluid.default_main_program(),
+                        feed={
+                            'x': feed_data,
+                            'y': feed_data
+                        },
+                        fetch_list=[z.name])
+
+          # np-ndarray of shape=[3, 2, 1], dtype=float32, whose elements are 2
+          print(out)
+
+    """
+    helper = LayerHelper('data', **locals())
+    check_type(name, 'name', (six.binary_type, six.text_type), 'data')
+    check_type(shape, 'shape', (list, tuple), 'data')
+
+    shape = list(shape)
+    for i in six.moves.range(len(shape)):
+        if shape[i] is None:
+            shape[i] = -1
+
+    if dtype:
+        return helper.create_global_variable(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            stop_gradient=True,
+            lod_level=lod_level,
+            is_data=True,
+            need_check_feed=True)
+    else:
+        return helper.create_global_variable(
+            name=name,
+            shape=shape,
+            dtype=paddle.get_default_dtype(),
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            stop_gradient=True,
+            lod_level=lod_level,
+            is_data=True,
+            need_check_feed=True)
+
+
+class InputSpec(object):
+    """
+    Define input specification of the model.
+
+    Args:
+        name (str): The name/alias of the variable, see :ref:`api_guide_Name`
+            for more details.
+        shape (tuple(integers)|list[integers]): List|Tuple of integers
+            declaring the shape. You can set "None" or -1 at a dimension
+            to indicate the dimension can be of any size. For example,
+            it is useful to set changeable batch size as "None" or -1.
+        dtype (np.dtype|str, optional): The type of the data. Supported
+            dtype: bool, float16, float32, float64, int8, int16, int32, int64,
+            uint8. Default: float32.
+
+    Examples:
+        .. code-block:: python
+
+        from paddle.static import InputSpec
+
+        input = InputSpec([None, 784], 'float32', 'x')
+        label = InputSpec([None, 1], 'int64', 'label')
+    """
+
+    def __init__(self, shape=None, dtype='float32', name=None):
+        self.shape = shape
+        self.dtype = dtype
+        self.name = name
+
+    def _create_feed_layer(self):
+        return data(self.name, shape=self.shape, dtype=self.dtype)
+
+    def __repr__(self):
+        return '{}(shape={}, dtype={}, name={})'.format(
+            type(self).__name__, self.shape, self.dtype, self.name)
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91da0926b1870bb4a7999e62965c135dcf36bf25
--- /dev/null
+++ b/python/paddle/static/nn/__init__.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'fc',
+    'batch_norm',
+    'embedding',
+    'bilinear_tensor_product',
+    'conv2d',
+    'conv2d_transpose',
+    'conv3d',
+    'conv3d_transpose',
+    'create_parameter',
+    'crf_decoding',
+    'data_norm',
+    'deformable_conv',
+    'group_norm',
+    'hsigmoid',
+    'instance_norm',
+    'layer_norm',
+    'multi_box_head',
+    'nce',
+    'prelu',
+    'row_conv',
+    'spectral_norm',
+]
+
+from ...fluid.layers import fc  #DEFINE_ALIAS
+from ...fluid.layers import batch_norm  #DEFINE_ALIAS
+from ...fluid.layers import bilinear_tensor_product  #DEFINE_ALIAS
+from ...fluid.layers import conv2d  #DEFINE_ALIAS
+from ...fluid.layers import conv2d_transpose  #DEFINE_ALIAS
+from ...fluid.layers import conv3d  #DEFINE_ALIAS
+from ...fluid.layers import conv3d_transpose  #DEFINE_ALIAS
+from ...fluid.layers import create_parameter  #DEFINE_ALIAS
+from ...fluid.layers import crf_decoding  #DEFINE_ALIAS
+from ...fluid.layers import data_norm  #DEFINE_ALIAS
+from ...fluid.layers import deformable_conv  #DEFINE_ALIAS
+from ...fluid.layers import group_norm  #DEFINE_ALIAS
+from ...fluid.layers import hsigmoid  #DEFINE_ALIAS
+from ...fluid.layers import instance_norm  #DEFINE_ALIAS
+from ...fluid.layers import layer_norm  #DEFINE_ALIAS
+from ...fluid.layers import multi_box_head  #DEFINE_ALIAS
+from ...fluid.layers import nce  #DEFINE_ALIAS
+from ...fluid.layers import prelu  #DEFINE_ALIAS
+from ...fluid.layers import row_conv  #DEFINE_ALIAS
+from ...fluid.layers import spectral_norm  #DEFINE_ALIAS
+
+from ...fluid.input import embedding  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 21cae803716a9af8cd040c47f147a02093b21137..0fed32a1676759bd94961af0a8949d035ec48c8f 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -22,9 +22,7 @@ from __future__ import print_function
 from .random import randperm
 from .attribute import rank  #DEFINE_ALIAS
 from .attribute import shape  #DEFINE_ALIAS
-from .creation import create_tensor  #DEFINE_ALIAS
-# from .creation import create_lod_tensor        #DEFINE_ALIAS
-# from .creation import create_random_int_lodtensor        #DEFINE_ALIAS
+from .creation import to_tensor  #DEFINE_ALIAS
 from .creation import crop_tensor  #DEFINE_ALIAS
 from .creation import diag  #DEFINE_ALIAS
 from .creation import eye  #DEFINE_ALIAS
@@ -60,7 +58,7 @@ from .logic import equal  #DEFINE_ALIAS
 from .logic import greater_equal  #DEFINE_ALIAS
 from .logic import greater_than  #DEFINE_ALIAS
 from .logic import is_empty  #DEFINE_ALIAS
-from .logic import isfinite  #DEFINE_ALIAS
+#from .logic import isfinite  #DEFINE_ALIAS
 from .logic import less_equal  #DEFINE_ALIAS
 from .logic import less_than  #DEFINE_ALIAS
 from .logic import logical_and  #DEFINE_ALIAS
@@ -76,7 +74,9 @@ from .logic import equal_all  #DEFINE_ALIAS
 from .manipulation import cast  #DEFINE_ALIAS
 from .manipulation import concat  #DEFINE_ALIAS
 from .manipulation import expand  #DEFINE_ALIAS
+from .manipulation import broadcast_to  #DEFINE_ALIAS
 from .manipulation import expand_as  #DEFINE_ALIAS
+from .manipulation import tile  #DEFINE_ALIAS
 from .manipulation import flatten  #DEFINE_ALIAS
 from .manipulation import gather  #DEFINE_ALIAS
 from .manipulation import gather_nd  #DEFINE_ALIAS
@@ -99,6 +99,7 @@ from .manipulation import unstack  #DEFINE_ALIAS
 from .manipulation import flip  #DEFINE_ALIAS
 from .manipulation import unbind  #DEFINE_ALIAS
 from .manipulation import roll  #DEFINE_ALIAS
+from .manipulation import chunk  #DEFINE_ALIAS
 from .math import abs  #DEFINE_ALIAS
 from .math import acos  #DEFINE_ALIAS
 from .math import asin  #DEFINE_ALIAS
@@ -110,8 +111,7 @@ from .math import cumsum  #DEFINE_ALIAS
 from .math import elementwise_add  #DEFINE_ALIAS
 from .math import elementwise_div  #DEFINE_ALIAS
 from .math import elementwise_floordiv  #DEFINE_ALIAS
-from .math import elementwise_max  #DEFINE_ALIAS
-from .math import elementwise_min  #DEFINE_ALIAS
+from .math import elementwise_mul  #DEFINE_ALIAS
 from .math import elementwise_mod  #DEFINE_ALIAS
 from .math import elementwise_pow  #DEFINE_ALIAS
 from .math import elementwise_sub  #DEFINE_ALIAS
@@ -140,9 +140,15 @@ from .math import sums  #DEFINE_ALIAS
 from .math import tanh  #DEFINE_ALIAS
 from .math import elementwise_sum  #DEFINE_ALIAS
 from .math import max  #DEFINE_ALIAS
+from .math import maximum  #DEFINE_ALIAS
 from .math import min  #DEFINE_ALIAS
+from .math import minimum  #DEFINE_ALIAS
 from .math import mm  #DEFINE_ALIAS
-from .math import div  #DEFINE_ALIAS
+from .math import divide  #DEFINE_ALIAS
+from .math import floor_divide  #DEFINE_ALIAS
+from .math import remainder  #DEFINE_ALIAS
+from .math import mod  #DEFINE_ALIAS
+from .math import floor_mod  #DEFINE_ALIAS
 from .math import multiply  #DEFINE_ALIAS
 from .math import add  #DEFINE_ALIAS
 from .math import atan  #DEFINE_ALIAS
@@ -152,11 +158,16 @@ from .math import log1p  #DEFINE_ALIAS
 from .math import erf  #DEFINE_ALIAS
 from .math import addcmul  #DEFINE_ALIAS
 from .math import addmm  #DEFINE_ALIAS
-from .math import clamp  #DEFINE_ALIAS
+from .math import clip  #DEFINE_ALIAS
 from .math import trace  #DEFINE_ALIAS
 from .math import kron  #DEFINE_ALIAS
-# from .random import gaussin        #DEFINE_ALIAS
-# from .random import uniform        #DEFINE_ALIAS
+from .math import isfinite  #DEFINE_ALIAS
+from .math import isinf  #DEFINE_ALIAS
+from .math import isnan  #DEFINE_ALIAS
+from .math import prod  #DEFINE_ALIAS
+from .random import standard_normal
+from .random import normal
+from .random import uniform  #DEFINE_ALIAS
 from .random import shuffle  #DEFINE_ALIAS
 from .random import randn  #DEFINE_ALIAS
 from .random import rand  #DEFINE_ALIAS
@@ -174,10 +185,12 @@ from .search import index_select  #DEFINE_ALIAS
 from .search import nonzero  #DEFINE_ALIAS
 from .search import sort  #DEFINE_ALIAS
 from .search import index_sample  #DEFINE_ALIAS
+from .search import masked_select  #DEFINE_ALIAS
 from .stat import mean  #DEFINE_ALIAS
 from .stat import reduce_mean  #DEFINE_ALIAS
 from .stat import std  #DEFINE_ALIAS
 from .stat import var  #DEFINE_ALIAS
+from .stat import numel  #DEFINE_ALIAS
 # from .tensor import Tensor        #DEFINE_ALIAS
 # from .tensor import LoDTensor        #DEFINE_ALIAS
 # from .tensor import LoDTensorArray        #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 10f93f90fbb875f3fd546fb8b561ec0d1933294c..1911d8ccc25e01ee6419fd26126881304ab61f01 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 
 from __future__ import print_function
+import numpy as np
+
 from ..fluid.framework import Variable
+from ..fluid.framework import unique_name
+from ..fluid.framework import _current_expected_place
+from ..fluid.framework import dygraph_only
 from ..fluid.initializer import Constant
 from ..fluid.layers import core
 from ..fluid.layer_helper import LayerHelper
@@ -21,23 +26,17 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtyp
 from ..fluid.framework import convert_np_dtype_to_dtype_, in_dygraph_mode, _varbase_creator, device_guard, OpProtoHolder
 from ..fluid.layers import fill_constant
 from paddle.common_ops_import import *
-import paddle
 
 # TODO: define functions to get create a tensor  
 from ..fluid.layers import crop_tensor  #DEFINE_ALIAS
-from ..fluid.layers import diag  #DEFINE_ALIAS
 from ..fluid.layers import fill_constant  #DEFINE_ALIAS
-from ..fluid.layers import create_tensor  #DEFINE_ALIAS
 from ..fluid.layers import linspace  #DEFINE_ALIAS
 import paddle
 
 __all__ = [
-    'create_tensor',
-    #       'create_lod_tensor',
-    #       'create_random_int_lodtensor',
+    'to_tensor',
     'crop_tensor',
     'diag',
-    'eye',
     'fill_constant',
     #       'get_tensor_from_selected_rows',
     'linspace',
@@ -55,10 +54,172 @@ __all__ = [
 ]
 
 
+@dygraph_only
+def to_tensor(data, dtype=None, place=None, stop_gradient=True):
+    """
+    Constructs a ``paddle.Tensor`` or ``paddle.ComplexTensor`` from ``data`` , 
+    which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
+
+    If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy 
+    will be performed and return origin tensor, otherwise a new tensor will be constructed
+    and returned. Similarly, if the data is an numpy\.ndarray of with the same ``dtype`` 
+    and the current place is cpu, no copy will be performed.
+
+    The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then 
+    ``x.real`` is the real part, and ``x.imag`` is the imaginary part.
+
+    Args:
+        data(scalar|tuple|list|ndarray|Tensor|ComplexTensor): Initial data for the tensor.
+            Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
+        dtype(str, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 
+            'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And
+            'complex64' , 'complex128' only for ComplexTensor.
+            Default: None, infers data type from ``data`` .
+        place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be  
+            CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place.
+        stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True.
+
+    Returns:
+        Tensor: A Tensor or ComplexTensor constructed from ``data``.
+
+    Raises:
+        TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor, paddle.ComplexTensor
+        ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]
+        TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128
+        ValueError: If ``place`` is not paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace
+
+    Examples:
+
+    .. code-block:: python
+
+        import paddle
+        import numpy as np
+        paddle.enable_imperative()
+                
+        type(paddle.to_tensor(1))
+        # <class 'paddle.Tensor'>
+
+        paddle.to_tensor(1)
+        # Tensor: generated_tensor_0
+        # - place: CUDAPlace(0)   # allocate on global default place CPU:0
+        # - shape: [1]
+        # - layout: NCHW
+        # - dtype: int64_t
+        # - data: [1]
+
+        x = paddle.to_tensor(1)
+        paddle.to_tensor(x, dtype='int32', place=paddle.CPUPlace()) # A new tensor will be constructed due to different dtype or place
+        # Tensor: generated_tensor_01
+        # - place: CPUPlace
+        # - shape: [1]
+        # - layout: NCHW
+        # - dtype: int
+        # - data: [1]
+
+        paddle.to_tensor((1.1, 2.2), place=paddle.CUDAPinnedPlace())
+        # Tensor: generated_tensor_1
+        #   - place: CUDAPinnedPlace
+        #   - shape: [2]
+        #   - layout: NCHW
+        #   - dtype: double
+        #   - data: [1.1 2.2]
+
+        paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CUDAPlace(0), stop_gradient=False)
+        # Tensor: generated_tensor_2
+        #   - place: CUDAPlace(0)
+        #   - shape: [2, 2]
+        #   - layout: NCHW
+        #   - dtype: double
+        #   - data: [0.1 0.2 0.3 0.4]
+
+        type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]]), , dtype='complex64')
+        # <class 'paddle.ComplexTensor'>
+
+        paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64')
+        # ComplexTensor[real]: generated_tensor_0.real
+        #   - place: CUDAPlace(0)
+        #   - shape: [2, 2]
+        #   - layout: NCHW
+        #   - dtype: float
+        #   - data: [1 2 3 4]
+        # ComplexTensor[imag]: generated_tensor_0.imag
+        #   - place: CUDAPlace(0)
+        #   - shape: [2, 2]
+        #   - layout: NCHW
+        #   - dtype: float
+        #   - data: [1 0 2 0]
+    """
+
+    if place is None:
+        place = _current_expected_place()
+    elif not isinstance(place,
+                        (core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace)):
+        raise ValueError(
+            "'place' must be any of paddle.Place, paddle.CUDAPinnedPlace, paddle.CUDAPlace"
+        )
+
+    #Todo(zhouwei): Support allocate tensor on any other specified card
+    if isinstance(place, core.CUDAPlace) and isinstance(
+            _current_expected_place(), core.CUDAPlace) and place._get_device_id(
+            ) != _current_expected_place()._get_device_id():
+        place = _current_expected_place()
+
+    if not isinstance(data, np.ndarray):
+        if np.isscalar(data) and not isinstance(data, str):
+            data = np.array([data])
+        elif isinstance(data, (list, tuple)):
+            data = np.array(data)
+            if data.dtype == np.object:
+                raise ValueError(
+                    "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually "
+                    "this means the input data contains nested lists with different lengths. "
+                )
+        elif isinstance(data, paddle.Tensor):
+            data.stop_gradient = stop_gradient
+            if not data.place._equals(place):
+                data = data._copy_to(place, False)
+            if dtype:
+                if convert_dtype(dtype) != convert_dtype(data.dtype):
+                    return data.astype(convert_dtype(dtype))
+            return data
+        elif isinstance(data, paddle.ComplexTensor):
+            return data
+        else:
+            raise TypeError(
+                "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor".
+                format(type(data)))
+
+    if dtype:
+        dtype = convert_dtype(dtype)
+        if dtype != data.dtype:
+            data = data.astype(dtype)
+
+    if not np.iscomplexobj(data):
+        return paddle.Tensor(
+            value=data,
+            place=place,
+            persistable=False,
+            zero_copy=True,
+            stop_gradient=stop_gradient)
+    else:
+        name = unique_name.generate('generated_tensor')
+        real_tensor = paddle.Tensor(
+            value=data.real,
+            place=place,
+            zero_copy=True,
+            name=name + ".real",
+            stop_gradient=stop_gradient)
+        imag_tensor = paddle.Tensor(
+            value=data.imag,
+            place=place,
+            zero_copy=True,
+            name=name + ".imag",
+            stop_gradient=stop_gradient)
+        return paddle.ComplexTensor(real_tensor, imag_tensor)
+
+
 def full_like(x, fill_value, dtype=None, name=None):
     """
-	:alias_main: paddle.full_like
-	:alias: paddle.tensor.full_like, paddle.tensor.creation.full_like
 
     This function creates a tensor filled with ``fill_value`` which has identical shape of ``x`` and ``dtype``.
     If the ``dtype`` is None, the data type of Tensor is same with ``x``.
@@ -66,7 +227,7 @@ def full_like(x, fill_value, dtype=None, name=None):
     Args:
         x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
         fill_value(bool|float|int): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output. The data type can be one
+        dtype(np.dtype|str, optional): The data type of output. The data type can be one
             of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
             data type is the same as input.
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
@@ -84,7 +245,7 @@ def full_like(x, fill_value, dtype=None, name=None):
           import paddle
           import numpy as np
           
-          paddle.enable_imperative()  # Now we are in imperative mode 
+          paddle.disable_static()  # Now we are in imperative mode 
           input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input')
           output = paddle.full_like(input, 2.0)
           # [[2. 2. 2.]
@@ -121,14 +282,12 @@ def full_like(x, fill_value, dtype=None, name=None):
 
 def ones(shape, dtype=None, name=None):
     """
-	:alias_main: paddle.ones
-	:alias: paddle.tensor.ones, paddle.tensor.creation.ones
 
     The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 1.
 
     Args:
         shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape is int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output Tensor, it supports
+        dtype(np.dtype|str, optional): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64. Default: if None, the data type is 'float32'.
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
     
@@ -144,7 +303,7 @@ def ones(shape, dtype=None, name=None):
         .. code-block:: python
 
           import paddle 
-          paddle.enable_imperative()
+          paddle.disable_static()
           
           # default dtype for ones OP
           data1 = paddle.ones(shape=[3, 2]) 
@@ -197,14 +356,14 @@ def ones_like(x, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        paddle.enable_imperative()
+            paddle.disable_static()
 
-        x = paddle.imperative.to_variable(np.array([1,2,3], dtype='float32'))
-        out1 = paddle.zeros_like(x) # [1., 1., 1.]
-        out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1]
+            x = paddle.to_tensor(np.array([1,2,3], dtype='float32'))
+            out1 = paddle.zeros_like(x) # [1., 1., 1.]
+            out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1]
 
     """
     return full_like(x=x, fill_value=1, dtype=dtype, name=name)
@@ -212,14 +371,11 @@ def ones_like(x, dtype=None, name=None):
 
 def zeros(shape, dtype=None, name=None):
     """
-	:alias_main: paddle.zeros
-	:alias: paddle.tensor.zeros, paddle.tensor.creation.zeros
-
     The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0.
 
     Args:
         shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of ``shape`` is int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output Tensor, it supports
+        dtype(np.dtype|str, optional): Data type of output Tensor, it supports
             bool, float16, float32, float64, int32 and int64. Default: if None, the date type is float32.
         name(str, optional): The default value is None.  Normally there is no need for user to set this
             property.  For more information, please refer to :ref:`api_guide_Name`.
@@ -237,7 +393,7 @@ def zeros(shape, dtype=None, name=None):
 
           import paddle
           
-          paddle.enable_imperative()  # Now we are in imperative mode
+          paddle.disable_static()  # Now we are in imperative mode
           data = paddle.zeros(shape=[3, 2], dtype='float32') 
           # [[0. 0.]
           #  [0. 0.]
@@ -248,7 +404,7 @@ def zeros(shape, dtype=None, name=None):
           
           # shape is a Tensor
           shape = paddle.fill_constant(shape=[2], dtype='int32', value=2)
-          data3 = paddle.ones(shape=shape, dtype='int32') 
+          data3 = paddle.zeros(shape=shape, dtype='int32') 
           # [[0 0]
           #  [0 0]]
     """
@@ -287,14 +443,14 @@ def zeros_like(x, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        paddle.enable_imperative()
+            paddle.disable_static()
 
-        x = paddle.imperative.to_variable(np.array([1,2,3], dtype='float32'))
-        out1 = paddle.zeros_like(x) # [0., 0., 0.]
-        out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0]
+            x = paddle.to_tensor(np.array([1,2,3], dtype='float32'))
+            out1 = paddle.zeros_like(x) # [0., 0., 0.]
+            out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0]
 
     """
     return full_like(x=x, fill_value=0, dtype=dtype, name=name)
@@ -302,8 +458,6 @@ def zeros_like(x, dtype=None, name=None):
 
 def eye(num_rows, num_columns=None, dtype=None, name=None):
     """
-	:alias_main: paddle.eye
-	:alias: paddle.tensor.eye, paddle.tensor.creation.eye
     
     This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere.
 
@@ -311,7 +465,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
         num_rows(int): the number of rows in each batch Tensor.
         num_columns(int, optional): the number of columns in each batch Tensor.
             If None, default: num_rows.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned Tensor.
+        dtype(np.dtype|str, optional): The data type of the returned Tensor.
             It should be int32, int64, float16, float32, float64. Default: if None, the data type
             is float32.
         name(str, optional): The default value is None.  Normally there is no need for 
@@ -329,7 +483,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
           
           import paddle
 
-          paddle.enable_imperative()  # Now we are in imperative mode
+          paddle.disable_static()  # Now we are in imperative mode
           data = paddle.eye(3, dtype='int32')
           # [[1 0 0]
           #  [0 1 0]
@@ -352,8 +506,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
 
 def full(shape, fill_value, dtype=None, name=None):
     """
-	:alias_main: paddle.full
-	:alias: paddle.tensor.full, paddle.tensor.creation.full
 
     This Op return a Tensor with the ``fill_value`` which size is same as ``shape``.
     
@@ -364,7 +516,7 @@ def full(shape, fill_value, dtype=None, name=None):
                 If ``shape`` is an Tensor, it should be an 1-D Tensor .
         fill_value(bool|float|int|Tensor): The constant value
             used to initialize the Tensor to be created. If ``fill_value`` is an Tensor, it must be an 1-D Tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output Tensor
+        dtype(np.dtype|str, optional): Data type of the output Tensor
             which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data
             type of created Tensor is `float32`
         name(str, optional): The default value is None.  Normally there is no need for user to set this
@@ -383,7 +535,7 @@ def full(shape, fill_value, dtype=None, name=None):
 
           import paddle
 
-          paddle.enable_imperative()  # Now we are in imperative mode
+          paddle.disable_static()  # Now we are in imperative mode
           data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') 
           #[[0]
           # [0]]
@@ -460,7 +612,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
         import paddle
         import numpy as np
 
-        paddle.enable_imperative()
+        paddle.disable_static()
 
         out1 = paddle.arange(5)
         # [0, 1, 2, 3, 4]
@@ -472,7 +624,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
         out3 = paddle.arange(4.999, dtype='float32')
         # [0., 1., 2., 3., 4.]
 
-        start_var = paddle.imperative.to_variable(np.array([3]))
+        start_var = paddle.to_tensor(np.array([3]))
         out4 = paddle.arange(start_var, 7)
         # [3, 4, 5, 6]
              
@@ -490,14 +642,13 @@ def _tril_triu_op(helper):
     """Base op of tril_op and triu_op
     """
     op_type = helper.layer_type
-    x = helper.kwargs.get('input', None)
+    x = helper.kwargs.get('x', None)
 
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
                              op_type)
     if len(x.shape) < 2:
-        raise ValueError("input shape in {} must be at least 2-D".format(
-            op_type))
+        raise ValueError("x shape in {} must be at least 2-D".format(op_type))
     diagonal = helper.kwargs.get('diagonal', 0)
     if not isinstance(diagonal, (int, )):
         raise TypeError("diagonal in {} must be a python Int".format(op_type))
@@ -521,18 +672,18 @@ def _tril_triu_op(helper):
     return out
 
 
-def tril(input, diagonal=0, name=None):
+def tril(x, diagonal=0, name=None):
     """
 	:alias_main: paddle.tril
 	:alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
 
     This op returns the lower triangular part of a matrix (2-D tensor) or batch
-    of matrices :attr:`input`, the other elements of the result tensor are set 
+    of matrices :attr:`x`, the other elements of the result tensor are set 
     to 0. The lower triangular part of the matrix is defined as the elements 
     on and below the diagonal.
 
     Args:
-        input (Variable): The input variable which is a Tensor.
+        x (Variable): The input variable x which is a Tensor.
             Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
         diagonal (int, optional): The diagonal to consider, default value is 0.
             If :attr:`diagonal` = 0, all elements on and below the main diagonal are
@@ -545,47 +696,41 @@ def tril(input, diagonal=0, name=None):
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor,
-        it's data type is the same as input's Tensor.
+        Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor x,
+        it's data type is the same as x's Tensor.
 
     Raises:
         TypeError: diagonal is not a int type.
-        ValueError: dimension of :attr:`input` is less than 2.
+        ValueError: dimension of :attr:`x` is less than 2.
 
     Examples:
         .. code-block:: python
 
             import numpy as np
-            import paddle.tensor as tensor
-            import paddle.fluid as fluid
+            import paddle
 
             data = np.arange(1, 13, dtype="int64").reshape(3,-1)
             # array([[ 1,  2,  3,  4],
             #        [ 5,  6,  7,  8],
             #        [ 9, 10, 11, 12]])
-            x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
-            exe = fluid.Executor(fluid.CPUPlace())
 
-            # example 1, default diagonal
-            tril = tensor.tril(x)
-            tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[tril], return_numpy=True)
+            paddle.disable_static()
+
+            x = paddle.to_variable(data)
+            
+            tril1 = paddle.tensor.tril(x)
             # array([[ 1,  0,  0,  0],
             #        [ 5,  6,  0,  0],
             #        [ 9, 10, 11,  0]])
 
             # example 2, positive diagonal value
-            tril = tensor.tril(x, diagonal=2)
-            tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[tril], return_numpy=True)
+            tril2 = paddle.tensor.tril(x, diagonal=2)
             # array([[ 1,  2,  3,  0], 
             #        [ 5,  6,  7,  8],
             #        [ 9, 10, 11, 12]])
 
             # example 3, negative diagonal value
-            tril = tensor.tril(x, diagonal=-1)
-            tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[tril], return_numpy=True)
+            tril3 = paddle.tensor.tril(x, diagonal=-1)
             # array([[ 0,  0,  0,  0],
             #        [ 5,  0,  0,  0],
             #        [ 9, 10,  0,  0]])
@@ -593,23 +738,23 @@ def tril(input, diagonal=0, name=None):
     """
     if in_dygraph_mode():
         op = getattr(core.ops, 'tril_triu')
-        return op(input, 'diagonal', diagonal, "lower", True)
+        return op(x, 'diagonal', diagonal, "lower", True)
 
     return _tril_triu_op(LayerHelper('tril', **locals()))
 
 
-def triu(input, diagonal=0, name=None):
+def triu(x, diagonal=0, name=None):
     """
 	:alias_main: paddle.triu
 	:alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
 
     This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices
-    :attr:`input`, the other elements of the result tensor are set to 0.
+    :attr:`x`, the other elements of the result tensor are set to 0.
     The upper triangular part of the matrix is defined as the elements on and
     above the diagonal.
 
     Args:
-        input (Variable): The input variable which is a Tensor.
+        x (Variable): The input variable x which is a Tensor.
             Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
         diagonal (int, optional): The diagonal to consider, default value is 0.
             If :attr:`diagonal` = 0, all elements on and above the main diagonal are
@@ -622,47 +767,41 @@ def triu(input, diagonal=0, name=None):
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor,
-        it's data type is the same as input's Tensor.
+        Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor x,
+        it's data type is the same as x's Tensor.
 
     Raises:
         TypeError: diagonal is not a int type.
-        ValueError: dimension of :attr:`input` is less than 2.
+        ValueError: dimension of :attr:`x` is less than 2.
 
     Examples:
         .. code-block:: python
 
             import numpy as np
-            import paddle.fluid as fluid
-            import paddle.tensor as tensor
+            import paddle
 
             data = np.arange(1, 13, dtype="int64").reshape(3,-1)
             # array([[ 1,  2,  3,  4],
             #        [ 5,  6,  7,  8],
             #        [ 9, 10, 11, 12]])
-            x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
-            exe = fluid.Executor(fluid.CPUPlace())
+
+            paddle.disable_static()
 
             # example 1, default diagonal
-            triu = tensor.triu(x)
-            triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[triu], return_numpy=True)
+            x = paddle.to_variable(data)
+            triu1 = paddle.tensor.triu(x)
             # array([[ 1,  2,  3,  4],
             #        [ 0,  6,  7,  8],
             #        [ 0,  0, 11, 12]])
 
             # example 2, positive diagonal value
-            triu = tensor.triu(x, diagonal=2)
-            triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[triu], return_numpy=True)
+            triu2 = paddle.tensor.triu(x, diagonal=2)
             # array([[0, 0, 3, 4],
             #        [0, 0, 0, 8],
             #        [0, 0, 0, 0]])
 
             # example 3, negative diagonal value
-            triu = tensor.triu(x, diagonal=-1)
-            triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[triu], return_numpy=True)
+            triu3 = paddle.tensor.triu(x, diagonal=-1)
             # array([[ 1,  2,  3,  4],
             #        [ 5,  6,  7,  8],
             #        [ 0, 10, 11, 12]])
@@ -670,7 +809,7 @@ def triu(input, diagonal=0, name=None):
     """
     if in_dygraph_mode():
         op = getattr(core.ops, 'tril_triu')
-        return op(input, 'diagonal', diagonal, "lower", False)
+        return op(x, 'diagonal', diagonal, "lower", False)
 
     return _tril_triu_op(LayerHelper('triu', **locals()))
 
@@ -723,12 +862,12 @@ def meshgrid(*args, **kwargs):
           import paddle
           import numpy as np
           
-          paddle.enable_imperative()
+          paddle.disable_static()
 
           input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
           input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
-          tensor_3 = paddle.imperative.to_variable(input_3)
-          tensor_4 = paddle.imperative.to_variable(input_4)
+          tensor_3 = paddle.to_tensor(input_3)
+          tensor_4 = paddle.to_tensor(input_4)
           grid_x, grid_y = paddle.tensor.meshgrid(tensor_3, tensor_4)
 
           #the shape of grid_x is (100, 200)
@@ -763,3 +902,92 @@ def meshgrid(*args, **kwargs):
         type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out})
 
     return out
+
+
+def diag(x, offset=0, padding_value=0, name=None):
+    """
+    If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned.
+
+    If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned.
+
+    The argument ``offset`` controls the diagonal offset:
+
+    If ``offset`` = 0, it is the main diagonal.
+
+    If ``offset`` > 0, it is superdiagonal.
+
+    If ``offset`` < 0, it is subdiagonal.
+
+    Args:
+        x (Tensor): The input tensor. Its shape is either 1-D or 2-D. Its data type should be float32, float64, int32, int64.
+        offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal.
+        padding_value (int|float, optional): Use this value to fill the area outside the specified diagonal band. Only takes effect when the input is a 1-D Tensor. The default value is 0.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, a square matrix or a vector. The output data type is the same as input data type.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+
+          paddle.disable_static()
+          x = paddle.to_tensor([1, 2, 3])
+          y = paddle.diag(x)
+          print(y.numpy())
+          # [[1 0 0]
+          #  [0 2 0]
+          #  [0 0 3]]
+
+          y = paddle.diag(x, offset=1)
+          print(y.numpy())
+          # [[0 1 0 0]
+          #  [0 0 2 0]
+          #  [0 0 0 3]
+          #  [0 0 0 0]]
+
+          y = paddle.diag(x, padding_value=6)
+          print(y.numpy())
+          # [[1 6 6]
+          #  [6 2 6]
+          #  [6 6 3]]
+
+        .. code-block:: python
+
+          import paddle
+
+          paddle.disable_static()
+          x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
+          y = paddle.diag(x)
+          print(y.numpy())
+          # [1 5]
+
+          y = paddle.diag(x, offset=1)
+          print(y.numpy())
+          # [2 6]
+
+          y = paddle.diag(x, offset=-1)
+          print(y.numpy())
+          # [4]
+    """
+    if in_dygraph_mode():
+        return core.ops.diag_v2(x, "offset", offset, "padding_value",
+                                padding_value)
+
+    check_type(x, 'x', (Variable), 'diag_v2')
+    check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'],
+                'diag_v2')
+    helper = LayerHelper("diag_v2", **locals())
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    helper.append_op(
+        type='diag_v2',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'offset': offset,
+               'padding_value': padding_value})
+
+    out.stop_gradient = True
+    return out
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 6b67394b6bd250282e2ea8f13134503ac6cbfc0a..a7bf2272a599ef2d6de076e7129b43152ca47b06 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -35,135 +35,134 @@ __all__ = [
 ]
 
 
-def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     """
-	:alias_main: paddle.matmul
-	:alias: paddle.matmul,paddle.tensor.matmul,paddle.tensor.linalg.matmul
+    Applies matrix multiplication to two tensors. `matmul` follows 
+    the complete broadcast rules, 
+    and its behavior is consistent with `np.matmul`.
 
-    Applies matrix multiplication to two tensors.
-
-    Currently, the input tensors' rank can be any, but when the rank of any
-    inputs is bigger than 3, this two inputs' rank should be equal.
+    Currently, the input tensors' number of dimensions can be any, `matmul` can be used to
+    achieve the `dot`, `matmul` and `batchmatmul`.
 
     The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
     flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
 
     - If a transpose flag is specified, the last two dimensions of the tensor
-      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
-      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
-      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
-      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
-      :math:`[1, D]` in transposed form.
-
-    - After transpose, the two tensors are 2-D or n-D and matrix multiplication
-      performs in the following way.
-
-      - If both are 2-D, they are multiplied like conventional matrices.
-      - If either is n-D, it is treated as a stack of matrices residing in the
-        last two dimensions and a batched matrix multiply supporting broadcast
-        applies on the two tensors.
-
-    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
-    nontransposed, the prepended or appended dimension :math:`1` will be
-    removed after matrix multiplication.
+      are transposed. If the tensor is ndim-1 of shape, the transpose is invalid. If the tensor 
+      is ndim-1 of shape :math:`[D]`, then for :math:`x` it is treated as :math:`[1, D]`, whereas 
+      for :math:`y` it is the opposite: It is treated as :math:`[D, 1]`.
+
+    The multiplication behavior depends on the dimensions of `x` and `y`. Specifically:
+
+    - If both tensors are 1-dimensional, the dot product result is obtained.
+
+    - If both tensors are 2-dimensional, the matrix-matrix product is obtained.
+
+    - If the `x` is 1-dimensional and the `y` is 2-dimensional, 
+      a `1` is prepended to its dimension in order to conduct the matrix multiply. 
+      After the matrix multiply, the prepended dimension is removed.
+      
+    - If the `x` is 2-dimensional and `y` is 1-dimensional, 
+      the matrix-vector product is obtained.
+
+    - If both arguments are at least 1-dimensional and at least one argument 
+      is N-dimensional (where N > 2), then a batched matrix multiply is obtained. 
+      If the first argument is 1-dimensional, a 1 is prepended to its dimension 
+      in order to conduct the batched matrix multiply and removed after. 
+      If the second argument is 1-dimensional, a 1 is appended to its 
+      dimension for the purpose of the batched matrix multiple and removed after. 
+      The non-matrix (exclude the last two dimensions) dimensions are 
+      broadcasted according the broadcast rule. 
+      For example, if input is a (j, 1, n, m) tensor and the other is a (k, m, p) tensor, 
+      out will be a (j, k, n, p) tensor.
 
     Args:
-        x (Variable): The input variable which is a Tensor or LoDTensor.
-        y (Variable): The input variable which is a Tensor or LoDTensor.
+        x (Tensor): The input tensor which is a Tensor.
+        y (Tensor): The input tensor which is a Tensor.
         transpose_x (bool): Whether to transpose :math:`x` before multiplication.
         transpose_y (bool): Whether to transpose :math:`y` before multiplication.
-        alpha (float): The scale of output. Default 1.0.
         name(str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
 
     Returns:
-        Variable: The product Tensor (or LoDTensor) variable.
+        Tensor: The output Tensor.
 
     Examples:
-        .. code-block:: python
-
-            # Examples to clarify shapes of the inputs and output
-            # x: [B, ..., M, K], y: [B, ..., K, N]
-            # paddle.matmul(x, y)  # out: [B, ..., M, N]
-
-            # x: [B, M, K], y: [B, K, N]
-            # paddle.matmul(x, y)  # out: [B, M, N]
 
-            # x: [B, M, K], y: [K, N]
-            # paddle.matmul(x, y)  # out: [B, M, N]
-
-            # x: [M, K], y: [K, N]
-            # paddle.matmul(x, y)  # out: [M, N]
-
-            # x: [B, M, K], y: [K]
-            # paddle.matmul(x, y)  # out: [B, M]
+    .. code-block:: python
 
-            # x: [K], y: [K]
-            # paddle.matmul(x, y)  # out: [1]
+        import paddle
+        import numpy as np
 
-            # x: [M], y: [N]
-            # paddle.matmul(x, y, True, True)  # out: [M, N]
+        paddle.disable_static()
+        # vector * vector
+        x_data = np.random.random([10]).astype(np.float32)
+        y_data = np.random.random([10]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [1]
+
+        # matrix * vector
+        x_data = np.random.random([10, 5]).astype(np.float32)
+        y_data = np.random.random([5]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10]
+
+        # batched matrix * broadcasted vector
+        x_data = np.random.random([10, 5, 2]).astype(np.float32)
+        y_data = np.random.random([2]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10, 5]
+
+        # batched matrix * batched matrix
+        x_data = np.random.random([10, 5, 2]).astype(np.float32)
+        y_data = np.random.random([10, 2, 5]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10, 5, 5]
+
+        # batched matrix * broadcasted matrix
+        x_data = np.random.random([10, 1, 5, 2]).astype(np.float32)
+        y_data = np.random.random([1, 3, 2, 5]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.matmul(x, y)
+        print(z.numpy().shape)
+        # [10, 3, 5, 5]
 
-            import paddle
-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[2, 3], dtype='float32')
-            y = fluid.data(name='y', shape=[3, 2], dtype='float32')
-            out = paddle.matmul(x, y, True, True)
     """
+    op_type = 'matmul_v2'
+    if in_dygraph_mode():
+        op = getattr(core.ops, op_type)
+        return op(x, y, 'trans_x', transpose_x, 'trans_y', transpose_y)
+
     attrs = {
-        'transpose_X': transpose_x,
-        'transpose_Y': transpose_y,
-        'alpha': float(alpha),
+        'trans_x': transpose_x,
+        'trans_y': transpose_y,
     }
 
-    if in_dygraph_mode():
-        out = _varbase_creator(dtype=x.dtype)
-        core.ops.matmul(x, y, out, 'transpose_X', transpose_x, 'transpose_Y',
-                        transpose_y, 'alpha', float(alpha))
-        return out
-
     def __check_input(x, y):
         var_names = {'x': x, 'y': y}
         for name, val in var_names.items():
-            check_variable_and_dtype(
-                val, name, ['float16', 'float32', 'float64'], 'matmul')
-        x_shape = list(x.shape)
-        y_shape = list(y.shape)
-        if len(x_shape) == 1:
-            x_shape = [1] + x_shape
-        if len(y_shape) == 1:
-            y_shape = y_shape + [1]
-
-        # check the inner 2 dimensions
-        if transpose_x:
-            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
-        if transpose_y:
-            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
-        if x_shape[-1] != y_shape[-2]:
-            assert (x_shape[-1] == -1) or (y_shape[-2] == -1),                         \
-                "After performing an optional transpose, Input X's width should be "   \
-                "equal to Y's width for multiplication "                               \
-                "prerequisites. But received X's shape: %s, Y's shape: %s\n" %         \
-                (x_shape, y_shape)
-
-        if len(y_shape) > 2 and len(x_shape) > 2:
-            for i, dim_x in enumerate(x_shape[:-2]):
-                # don't check neg shape
-                if dim_x < 0 or y_shape[i] < 0:
-                    continue
-                if dim_x != y_shape[i]:
-                    raise ValueError(
-                        "When the matrix is larger than 2 dimensions, the higher "
-                        "dimensional values of the two matrices need to be equal. "
-                        "But received x_shape[%d] != y_shape[%d]. X's shape: %s, "
-                        "Y's shape: %s.\n" % (i, i, x_shape, y_shape))
+            check_variable_and_dtype(val, name, ['float32', 'float64'],
+                                     'matmul')
 
     __check_input(x, y)
 
-    helper = LayerHelper('matmul', **locals())
+    helper = LayerHelper('matmul_v2', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
-        type='matmul',
+        type='matmul_v2',
         inputs={'X': x,
                 'Y': y},
         outputs={'Out': out},
@@ -452,35 +451,34 @@ def dist(x, y, p=2):
 
 def dot(x, y, name=None):
     """
-	:alias_main: paddle.dot
-	:alias: paddle.dot,paddle.tensor.dot,paddle.tensor.linalg.dot
-
     This operator calculates inner product for vectors.
    
     .. note::
-       Only support 1-d Tensor(vector).
+       Support 1-d and 2-d Tensor. When it is 2d, the first dimension of this matrix 
+       is the batch dimension, which means that the vectors of multiple batches are dotted. 
 
     Parameters:
-        x(Variable): 1-D ``Tensor`` or ``LoDTensor``. Its datatype should be ``float32``, ``float64``, ``int32``, ``int64``
-        y(Variable): 1-D ``Tensor`` or ``LoDTensor``. Its datatype soulde be ``float32``, ``float64``, ``int32``, ``int64``
+        x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``
+        y(Tensor): 1-D or 2-D ``Tensor``. Its dtype soulde be ``float32``, ``float64``, ``int32``, ``int64``
         name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`
 
     Returns:
-        Variable: the calculated result Tensor/LoDTensor.
+        Variable: the calculated result Tensor.
 
     Examples:
 
     .. code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import numpy as np
-        
-        with fluid.dygraph.guard():
-          x = fluid.dygraph.to_variable(np.random.uniform(0.1, 1, [10]).astype(np.float32))
-          y = fluid.dygraph.to_variable(np.random.uniform(1, 3, [10]).astype(np.float32))
-          z = paddle.dot(x, y)
-          print(z.numpy())
+
+        paddle.disable_static()
+        x_data = np.random.uniform(0.1, 1, [10]).astype(np.float32)
+        y_data = np.random.uniform(1, 3, [10]).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        z = paddle.dot(x, y)
+        print(z.numpy())
 
     """
     op_type = 'dot'
@@ -605,10 +603,10 @@ def cross(x, y, axis=None, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            from paddle.imperative import to_variable
+            from paddle import to_variable
             import numpy as np
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
             data_x = np.array([[1.0, 1.0, 1.0],
                                [2.0, 2.0, 2.0],
@@ -651,11 +649,8 @@ def cross(x, y, axis=None, name=None):
     return out
 
 
-def cholesky(x, upper=False):
+def cholesky(x, upper=False, name=None):
     """
-	:alias_main: paddle.cholesky
-	:alias: paddle.cholesky,paddle.tensor.cholesky,paddle.tensor.linalg.cholesky
-
     Computes the Cholesky decomposition of one symmetric positive-definite
     matrix or batches of symmetric positive-definite matrice. 
     
@@ -680,21 +675,22 @@ def cholesky(x, upper=False):
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
             import numpy as np
 
-            with fluid.dygraph.guard():
-                a = np.random.rand(3, 3)
-                a_t = np.transpose(a, [1, 0])
-                x = np.matmul(a, a_t) + 1e-03
-                x = fluid.dygraph.to_variable(x)
-                out = paddle.cholesky(x, upper=False)
-                print(out.numpy())
-                # [[1.190523   0.         0.        ]
-                #  [0.9906703  0.27676893 0.        ]
-                #  [1.25450498 0.05600871 0.06400121]]
+            paddle.disable_static()
+            a = np.random.rand(3, 3)
+            a_t = np.transpose(a, [1, 0])
+            x_data = np.matmul(a, a_t) + 1e-03
+            x = paddle.to_variable(x_data)
+            out = paddle.cholesky(x, upper=False)
+            print(out.numpy())
+            # [[1.190523   0.         0.        ]
+            #  [0.9906703  0.27676893 0.        ]
+            #  [1.25450498 0.05600871 0.06400121]]
 
     """
+    if in_dygraph_mode():
+        return core.ops.cholesky(x, "upper", upper)
     check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cholesky')
     check_type(upper, 'upper', bool, 'cholesky')
     helper = LayerHelper('cholesky', **locals())
@@ -729,26 +725,32 @@ def bmm(x, y, name=None):
 
     Examples:
         import paddle
-        import paddle.fluid as fluid
-        x = fluid.layers.data(name='x', shape=[10, 3, 4], dtype='float32')
-        y = fluid.layers.data(name='y', shape=[10, 4, 5], dtype='float32')
-        out = paddle.bmm(x, y)
-    
-        # In dygraph mode:
+
+        # In imperative mode:
         # size input1: (2, 2, 3) and input2: (2, 3, 2)
         input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]])
         input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
 
-        with fluid.dygraph.guard():
-            x = fluid.dygraph.to_variable(input1)
-            y = fluid.dygraph.to_variable(input2)
-            out = paddle.bmm(x, y)
-            #output size: (2, 2, 2)
-            #output value:
-            #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
-            out_np = out.numpy()
+        paddle.disable_static()
+        
+        x = paddle.to_variable(input1)
+        y = paddle.to_variable(input2)
+        out = paddle.bmm(x, y)
+        #output size: (2, 2, 2)
+        #output value:
+        #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
+        out_np = out.numpy()
     """
-
+    x_shape = x.shape
+    y_shape = y.shape
+    if not len(x_shape) == len(y_shape) == 3:
+        raise ValueError(
+            "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}".
+            format(x_shape, y_shape))
+    if x_shape[2] != y_shape[1]:
+        raise ValueError(
+            "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".
+            format(x_shape, y_shape))
     helper = LayerHelper('bmm', **locals())
     if in_dygraph_mode():
         return core.ops.bmm(x, y)
@@ -776,13 +778,13 @@ def histogram(input, bins=100, min=0, max=0):
         .. code-block:: python
             import paddle
             import numpy as np
-            startup_program = paddle.Program()
-            train_program = paddle.Program()
-            with paddle.program_guard(train_program, startup_program):
+            startup_program = paddle.static.Program()
+            train_program = paddle.static.Program()
+            with paddle.static.program_guard(train_program, startup_program):
                 inputs = paddle.data(name='input', dtype='int32', shape=[2,3])
                 output = paddle.histogram(inputs, bins=5, min=1, max=5)
                 place = paddle.CPUPlace()
-                exe = paddle.Executor(place)
+                exe = paddle.static.Executor(place)
                 exe.run(startup_program)
                 img = np.array([[2, 4, 2], [2, 5, 4]]).astype(np.int32)
                 res = exe.run(train_program,
@@ -794,11 +796,12 @@ def histogram(input, bins=100, min=0, max=0):
         .. code-block:: python
             import paddle
             import numpy as np
-            with paddle.imperative.guard(paddle.CPUPlace()):
-                inputs_np = np.array([1, 2, 1]).astype(np.float)
-                inputs = paddle.imperative.to_variable(inputs_np)
-                result = paddle.histogram(inputs, bins=4, min=0, max=3)
-                print(result) # [0, 2, 1, 0]
+            paddle.disable_static(paddle.CPUPlace())
+            inputs_np = np.array([1, 2, 1]).astype(np.float)
+            inputs = paddle.to_variable(inputs_np)
+            result = paddle.histogram(inputs, bins=4, min=0, max=3)
+            print(result) # [0, 2, 1, 0]
+            paddle.enable_static()
     """
     if in_dygraph_mode():
         return core.ops.histogram(input, "bins", bins, "min", min, "max", max)
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 936022dd73b31f2d5839cc7e8698c6757378d874..36b558d597c1ce1333a8f1eec54e2fd2813625e3 100644
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 from ..fluid.layer_helper import LayerHelper
-from ..fluid.data_feeder import check_type
+from ..fluid.data_feeder import check_type, check_variable_and_dtype
 from ..fluid.layers.layer_function_generator import templatedoc
 from .. import fluid
+from ..fluid.framework import in_dygraph_mode
+from paddle.common_ops_import import *
 
 # TODO: define logic functions of a tensor  
 from ..fluid.layers import is_empty  #DEFINE_ALIAS
@@ -71,12 +73,11 @@ def equal_all(x, y, name=None):
 
           import numpy as np
           import paddle
-          import paddle.imperative as imperative
 
-          paddle.enable_imperative()
-          x = imperative.to_variable(np.array([1, 2, 3]))
-          y = imperative.to_variable(np.array([1, 2, 3]))
-          z = imperative.to_variable(np.array([1, 4, 3]))
+          paddle.disable_static()
+          x = paddle.to_variable(np.array([1, 2, 3]))
+          y = paddle.to_variable(np.array([1, 2, 3]))
+          z = paddle.to_variable(np.array([1, 4, 3]))
           result1 = paddle.equal_all(x, y)
           print(result1.numpy()) # result1 = [True ]
           result2 = paddle.equal_all(x, z)
@@ -92,75 +93,70 @@ def equal_all(x, y, name=None):
 
 
 @templatedoc()
-def allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
+def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     """
-	:alias_main: paddle.allclose
-	:alias: paddle.allclose,paddle.tensor.allclose,paddle.tensor.logic.allclose
-
     ${comment}
 
     Args:
-        input(inputtype):{input_comment}.
-        other(othertype):{other_comment}.
-        rtol(rtoltype,optional):{rtol_comment}.
-        atol(atoltype,optional):{atol_comment}.
-        equal_nan(equalnantype,optional):{equal_nan_comment}.
-        name(STR, optional): The default value is None.
-                        Normally there is no need for user to set this property.
-                        For more information, please refer to :ref:`api_guide_Name`.
+        x(Tensor): ${input_comment}.
+        y(Tensor): ${other_comment}.
+        rtol(rtoltype, optional): ${rtol_comment}.
+        atol(atoltype, optional): ${atol_comment}.
+        equal_nan(equalnantype, optional): ${equal_nan_comment}.
+        name (str, optional): Name for the operation. For more information, please
+            refer to :ref:`api_guide_Name`. Default: None.
 
     Returns:
-        ${out_comment}.
+        Tensor: ${out_comment}.
+
+    Raises:
+        TypeError: The data type of ``x`` must be one of float32, float64.
+        TypeError: The data type of ``y`` must be one of float32, float64.
+        TypeError: The type of ``rtol`` must be float.
+        TypeError: The type of ``atol`` must be float.
+        TypeError: The type of ``equal_nan`` must be bool.
 
-    Return Type:
-        ${out_type}
-        
     Examples:
         .. code-block:: python
 
           import paddle
-          import paddle.fluid as fluid
           import numpy as np
 
-          use_cuda = fluid.core.is_compiled_with_cuda()
+          paddle.disable_static()
 
-          a = fluid.data(name="a", shape=[2], dtype='float32')
-          b = fluid.data(name="b", shape=[2], dtype='float32')
-
-          result = paddle.allclose(a, b, rtol=1e-05, atol=1e-08,
+          np_x = np.array([10000., 1e-07]).astype("float32")
+          np_y = np.array([10000.1, 1e-08]).astype("float32")
+          x = paddle.to_tensor(np_x)
+          y = paddle.to_tensor(np_y)
+          result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
                                   equal_nan=False, name="ignore_nan")
-          result_nan = paddle.allclose(a, b, rtol=1e-05, atol=1e-08,
+          np_result1 = result1.numpy()
+          # [False]
+          result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
                                       equal_nan=True, name="equal_nan")
-
-          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(fluid.default_startup_program())
-
-          x = np.array([10000., 1e-07]).astype("float32")
-          y = np.array([10000.1, 1e-08]).astype("float32")
-          result_v, result_nan_v = exe.run(
-              feed={'a': x, 'b': y},
-              fetch_list=[result, result_nan])
-          print(result_v, result_nan_v)
-          # Output: (array([False]), array([False]))
-
-          x = np.array([10000., 1e-08]).astype("float32")
-          y = np.array([10000.1, 1e-09]).astype("float32")
-          result_v, result_nan_v = exe.run(
-              feed={'a': x, 'b': y},
-              fetch_list=[result, result_nan])
-          print(result_v, result_nan_v)
-          # Output: (array([ True]), array([ True]))
-
-          x = np.array([1.0, float('nan')]).astype("float32")
-          y = np.array([1.0, float('nan')]).astype("float32")
-          result_v, result_nan_v = exe.run(
-              feed={'a': x, 'b': y},
-              fetch_list=[result, result_nan])
-          print(result_v, result_nan_v)
-          # Output: (array([False]), array([ True]))
+          np_result2 = result2.numpy()
+          # [False]
+
+          np_x = np.array([1.0, float('nan')]).astype("float32")
+          np_y = np.array([1.0, float('nan')]).astype("float32")
+          x = paddle.to_tensor(np_x)
+          y = paddle.to_tensor(np_y)
+          result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
+                                  equal_nan=False, name="ignore_nan")
+          np_result1 = result1.numpy()
+          # [False]
+          result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
+                                      equal_nan=True, name="equal_nan")
+          np_result2 = result2.numpy()
+          # [True]
     """
 
+    if in_dygraph_mode():
+        return core.ops.allclose(x, y, 'rtol', rtol, 'atol', atol, 'equal_nan',
+                                 equal_nan)
+
+    check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose')
+    check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose')
     check_type(rtol, 'rtol', float, 'allclose')
     check_type(atol, 'atol', float, 'allclose')
     check_type(equal_nan, 'equal_nan', bool, 'allclose')
@@ -168,7 +164,7 @@ def allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     helper = LayerHelper("allclose", **locals())
     out = helper.create_variable_for_type_inference(dtype='bool')
 
-    inputs = {'Input': input, 'Other': other}
+    inputs = {'Input': x, 'Other': y}
     outputs = {'Out': out}
     attrs = {'rtol': rtol, 'atol': atol, 'equal_nan': equal_nan}
     helper.append_op(
@@ -201,11 +197,10 @@ def equal(x, y, name=None):
 
           import numpy as np
           import paddle
-          import paddle.imperative as imperative
 
-          paddle.enable_imperative()
-          x = imperative.to_variable(np.array([1, 2, 3]))
-          y = imperative.to_variable(np.array([1, 3, 2]))
+          paddle.disable_static()
+          x = paddle.to_variable(np.array([1, 2, 3]))
+          y = paddle.to_variable(np.array([1, 3, 2]))
           result1 = paddle.equal(x, y)
           print(result1.numpy())  # result1 = [True False False]
     """
@@ -234,11 +229,10 @@ def greater_equal(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.greater_equal(x, y)
             print(result1.numpy())  # result1 = [True False True]
     """
@@ -267,11 +261,10 @@ def greater_than(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.greater_than(x, y)
             print(result1.numpy())  # result1 = [False False True]
     """
@@ -301,11 +294,10 @@ def less_equal(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.less_equal(x, y)
             print(result1.numpy())  # result1 = [True True False]
     """
@@ -335,11 +327,10 @@ def less_than(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.less_than(x, y)
             print(result1.numpy())  # result1 = [False True False]
     """
@@ -369,11 +360,10 @@ def not_equal(x, y, name=None):
         .. code-block:: python
             import numpy as np
             import paddle
-            import paddle.imperative as imperative
 
-            paddle.enable_imperative()
-            x = imperative.to_variable(np.array([1, 2, 3]))
-            y = imperative.to_variable(np.array([1, 3, 2]))
+            paddle.disable_static()
+            x = paddle.to_variable(np.array([1, 2, 3]))
+            y = paddle.to_variable(np.array([1, 3, 2]))
             result1 = paddle.not_equal(x, y)
             print(result1.numpy())  # result1 = [False True True]
     """
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index c2f67b4e13855b1a3e29e2bdd675dbf418b0a9a1..44ec0a5a4df985d5217011a841065ce504483ab7 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from ..fluid.layers import core, reshape
+from ..fluid.layers import core
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
@@ -23,18 +23,12 @@ from ..fluid.layers import utils
 import numpy as np
 # TODO: define functions to manipulate a tensor  
 from ..fluid.layers import cast  #DEFINE_ALIAS
-from ..fluid.layers import expand  #DEFINE_ALIAS
 from ..fluid.layers import expand_as  #DEFINE_ALIAS
-from ..fluid.layers import flatten  #DEFINE_ALIAS
-from ..fluid.layers import reshape  #DEFINE_ALIAS
-from ..fluid.layers import scatter  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
 from ..fluid.layers import strided_slice  #DEFINE_ALIAS
 from ..fluid.layers import transpose  #DEFINE_ALIAS
-from ..fluid.layers import unique  #DEFINE_ALIAS
 from ..fluid.layers import unstack  #DEFINE_ALIAS
 
-from ..fluid.layers import gather_nd  #DEFINE_ALIAS
 from ..fluid.layers import scatter_nd_add  #DEFINE_ALIAS
 from ..fluid.layers import scatter_nd  #DEFINE_ALIAS
 from ..fluid.layers import shard_index  #DEFINE_ALIAS
@@ -43,11 +37,35 @@ from ..fluid import layers
 import paddle
 
 __all__ = [
-    'cast', 'concat', 'expand', 'expand_as', 'flatten', 'gather', 'gather_nd',
-    'reshape', 'reverse', 'scatter', 'scatter_nd_add', 'scatter_nd',
-    'shard_index', 'slice', 'split', 'squeeze', 'stack', 'strided_slice',
-    'transpose', 'unique', 'unique_with_counts', 'unsqueeze', 'unstack', 'flip',
-    'unbind', 'roll'
+    'cast',
+    'concat',
+    'expand',
+    'broadcast_to',
+    'expand_as',
+    'flatten',
+    'gather',
+    'gather_nd',
+    'reshape',
+    'reverse',
+    'scatter',
+    'scatter_nd_add',
+    'scatter_nd',
+    'shard_index',
+    'slice',
+    'split',
+    'chunk'
+    'squeeze',
+    'stack',
+    'strided_slice',
+    'transpose',
+    'unique',
+    'unique_with_counts',
+    'unsqueeze',
+    'unstack',
+    'flip',
+    'unbind',
+    'roll',
+    'tile',
 ]
 
 
@@ -83,16 +101,16 @@ def concat(x, axis=0, name=None):
             import paddle
             import numpy as np
             
-            paddle.enable_imperative()  # Now we are in imperative mode
+            paddle.disable_static()  # Now we are in imperative mode
             in1 = np.array([[1, 2, 3],
                             [4, 5, 6]])
             in2 = np.array([[11, 12, 13],
                             [14, 15, 16]])
             in3 = np.array([[21, 22],
                             [23, 24]])
-            x1 = paddle.imperative.to_variable(in1)
-            x2 = paddle.imperative.to_variable(in2)
-            x3 = paddle.imperative.to_variable(in3)
+            x1 = paddle.to_tensor(in1)
+            x2 = paddle.to_tensor(in2)
+            x3 = paddle.to_tensor(in3)
             zero = paddle.full(shape=[1], dtype='int32', fill_value=0)
             # When the axis is negative, the real axis is (axis + Rank(x))
             # As follow, axis is -1, Rank(x) is 2, the real axis is 1
@@ -136,12 +154,12 @@ def flip(x, axis, name=None):
           import paddle
           import numpy as np
 
-          paddle.enable_imperative()
+          paddle.disable_static()
 
           image_shape=(3, 2, 2)
           x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape)
           x = x.astype('float32')
-          img = paddle.imperative.to_variable(x)
+          img = paddle.to_variable(x)
           out = paddle.flip(img, [0,1])
 
           print(out) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]]
@@ -169,6 +187,114 @@ def flip(x, axis, name=None):
 reverse = flip  #DEFINE_ALIAS
 
 
+def flatten(x, start_axis=0, stop_axis=-1, name=None):
+    """
+    **Flatten op**
+
+    Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.
+
+    For Example:
+
+    .. code-block:: text
+
+        Case 1:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            start_axis = 1
+            end_axis = 2
+
+          We get:
+            Out.shape = (3, 1000 * 100, 2)
+
+        Case 2:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            start_axis = 0
+            stop_axis = -1
+
+          We get:
+            Out.shape = (3 * 100 * 100 * 4)
+
+    Args:
+        x (Variable): A tensor of number of dimentions >= axis. A tensor with data type float32,
+                      float64, int8, int32, int64.
+        start_axis (int): the start axis to flatten
+        stop_axis (int): the stop axis to flatten
+        name(str, Optional): For details, please refer to :ref:`api_guide_Name`.
+                        Generally, no setting is required. Default: None.
+
+    Returns:
+        Variable: A tensor with the contents of the input tensor, with input \
+                  axes flattened by indicated start axis and end axis. \
+                  A Tensor with data type same as input x.
+
+    Raises:
+        ValueError: If x is not a Variable.
+        ValueError: If start_axis or stop_axis is illegal.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            image_shape=(2, 3, 4, 4)
+            x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100.
+            x = x.astype('float32')
+            
+            img = paddle.to_variable(x)
+            out = paddle.flatten(img, start_axis=1, stop_axis=2)
+            # out shape is [2, 12, 4]
+    """
+    if not (isinstance(x, Variable)):
+        raise ValueError("The input x should be a Variable")
+
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten')
+    helper = LayerHelper('flatten', **locals())
+
+    x_dim = len(x.shape)
+    if not (isinstance(start_axis, int)) or (
+            start_axis > x_dim - 1) or start_axis < -x_dim:
+        raise ValueError(
+            "The start_axis should be a int, and in range [-rank(x), rank(x))")
+    if not (isinstance(stop_axis, int)) or (
+            stop_axis > x_dim - 1) or stop_axis < -x_dim:
+        raise ValueError(
+            "The stop_axis should be a int, and in range [-rank(x), rank(x))")
+    if start_axis < 0:
+        start_axis = start_axis + x_dim
+    if stop_axis < 0:
+        stop_axis = stop_axis + x_dim
+    if start_axis > stop_axis:
+        raise ValueError("The stop_axis should be larger than stat_axis")
+
+    if in_dygraph_mode():
+        dy_out, _ = core.ops.flatten_contiguous_range(
+            x, 'start_axis', start_axis, 'stop_axis', stop_axis)
+        return dy_out
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    x_shape = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='flatten_contiguous_range',
+        inputs={"X": x},
+        outputs={'Out': out,
+                 'XShape': x_shape},
+        attrs={"start_axis": start_axis,
+               "stop_axis": stop_axis})
+    return out
+
+
 def roll(x, shifts, axis=None, name=None):
     """
 	:alias_main: paddle.roll
@@ -197,8 +323,8 @@ def roll(x, shifts, axis=None, name=None):
             data = np.array([[1.0, 2.0, 3.0],
                              [4.0, 5.0, 6.0],
                              [7.0, 8.0, 9.0]])
-            paddle.enable_imperative()
-            x = paddle.imperative.to_variable(data)
+            paddle.disable_static()
+            x = paddle.to_variable(data)
             out_z1 = paddle.roll(x, shifts=1)
             print(out_z1.numpy())
             #[[9. 1. 2.]
@@ -248,17 +374,22 @@ def roll(x, shifts, axis=None, name=None):
         outputs={'Out': out},
         attrs={'axis': axis,
                'shifts': shifts})
-    out = reshape(out, shape=origin_shape, inplace=True)
+    out = layers.reshape(out, shape=origin_shape, inplace=True)
     return out
 
 
-def stack(x, axis=0, out=None, name=None):
+def stack(x, axis=0, name=None):
     """
 	:alias_main: paddle.stack
-	:alias: paddle.stack,paddle.tensor.stack,paddle.tensor.manipulation.stack
-
+	:alias: paddle.stack, paddle.tensor.stack, paddle.tensor.manipulation.stack
 
-    This OP stacks all the inputs :code:`x` along axis.
+    This OP stacks all the input tensors ``x`` along ``axis`` dimemsion. 
+    All tensors must be of the same shape and same dtype.
+    
+    For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked 
+    tensor is [N, A, B]; if ``axis == 1``, the shape of stacked 
+    tensor is [A, N, B], etc.
+    
 
     .. code-block:: text
 
@@ -284,7 +415,6 @@ def stack(x, axis=0, out=None, name=None):
 
         Case 2:
 
-
           Input:
             x[0].shape = [1, 2]
             x[0].data = [ [1.0 , 2.0 ] ]
@@ -295,7 +425,7 @@ def stack(x, axis=0, out=None, name=None):
 
 
           Attrs:
-            axis = 1 or axis = -2
+            axis = 1 or axis = -2  # If axis = -2, axis = axis+ndim(x[0])+1 = -2+2+1 = 1.
 
           Output:
             Out.shape = [1, 3, 2]
@@ -304,72 +434,44 @@ def stack(x, axis=0, out=None, name=None):
                           [5.0, 6.0] ] ]
 
     Args:
-        x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors.
-                                     If :code:`x` is a :code:`list`, the shapes of all these Tensors
-                                     must be the same. Supposing input is N dims
-                                     Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
-                                     Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
-                                     Support data types: float32, float64, int32, int64.
-        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
-                              R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
-                              The default value of axis is 0.
-
+        x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
+                                     If ``x`` is a ``list``, the Tensors in ``x``
+                                     must be of the same shape and dtype. Supported data types: float32, float64, int32, int64.
+        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
+                              where ``R`` is the number of dimensions of the first input tensor ``x[0]``. 
+                              If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
+        name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
+        
     Returns:
-        Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`.
+        Tensor: The stacked tensor with same data type as input.
 
     Example:    
         .. code-block:: python
-            import numpy as np
+
             import paddle
-            import paddle.fluid as fluid
+            import numpy as np
 
             data1 = np.array([[1.0, 2.0]])
             data2 = np.array([[3.0, 4.0]])
             data3 = np.array([[5.0, 6.0]])
-            with fluid.dygraph.guard():
-                x1 = fluid.dygraph.to_variable(data1)
-                x2 = fluid.dygraph.to_variable(data2)
-                x3 = fluid.dygraph.to_variable(data3)
-                result = paddle.stack([x1, x2, x3], axis=0)
-                # result shape: [3, 1, 2]
-                # result value: [[[1.0, 2.0]],
-                #                [[3.0, 4.0]],
-                #                [[5.0, 6.0]]]
-    """
-
-    helper = LayerHelper('stack', **locals())
-    axis = 0 if axis is None else axis
-
-    if not isinstance(x, list) and not isinstance(x, tuple):
-        x = [x]
-    out = helper.create_variable_for_type_inference(x[0].dtype)
-    if not in_dygraph_mode() and \
-            x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-        assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
-                            "number of the elements must be 1, but received %s." % len(x)
-        out_index = helper.create_variable_for_type_inference(dtype="int32")
-        helper.append_op(
-            type='tensor_array_to_tensor',
-            inputs={'X': x[0]},
-            outputs={'Out': [out],
-                     'OutIndex': [out_index]},
-            attrs={'axis': axis,
-                   'use_stack': True})
-    else:
-        helper.append_op(
-            type='stack',
-            inputs={'X': x},
-            outputs={'Y': out},
-            attrs={'axis': axis})
 
-    return out
+            paddle.disable_static()
+            x1 = paddle.to_variable(data1)
+            x2 = paddle.to_variable(data2)
+            x3 = paddle.to_variable(data3)
+
+            out = paddle.stack([x1, x2, x3], axis=0)
+            print(out.shape)  # [3, 1, 2]
+            print(out.numpy())
+            # [[[1., 2.]],
+            #  [[3., 4.]],
+            #  [[5., 6.]]]
+    """
+    return layers.stack(x, axis, name)
 
 
 def split(x, num_or_sections, axis=0, name=None):
     """
-	:alias_main: paddle.split
-        :alias: paddle.tensor.split, paddle.tensor.manipulation.split
-    
     Split the input tensor into multiple sub-Tensors.
     
     Args:
@@ -396,10 +498,10 @@ def split(x, num_or_sections, axis=0, name=None):
             import numpy as np
             import paddle
             
-            paddle.enable_imperative()
+            paddle.disable_static()
             # x is a Tensor which shape is [3, 9, 5]
             x_np = np.random.random([3, 9, 5]).astype("int32")
-            x = paddle.imperative.to_variable(x_np)
+            x = paddle.to_tensor(x_np)
 
             out0, out1, out22 = paddle.split(x, num_or_sections=3, axis=1)
             # out0.shape [3, 3, 5]
@@ -473,21 +575,22 @@ def squeeze(x, axis=None, name=None):
             out.shape = [1, 3, 5]
 
     Args:
-        input (Tensor): The input Tensor. Support data type: float32, float64, int8, int32, int64.
+        x (Tensor): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
         axis (int|list|tuple, optional): An integer or list of integers, indicating the dimensions to be squeezed. Default is None.
-                          The range of axis is :math:`[-ndim(input), ndim(input))`.
-                          If axis is negative, :math:`axis = axis + ndim(input)`.
-                          If axis is None, all the dimensions of input of size 1 will be removed.
+                          The range of axis is :math:`[-ndim(x), ndim(x))`.
+                          If axis is negative, :math:`axis = axis + ndim(x)`.
+                          If axis is None, all the dimensions of x of size 1 will be removed.
         name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
 
     Returns:
-        Tensor: Output squeezed Tensor. Data type is same as input Tensor.
+        Tensor: Squeezed Tensor with the same data type as input Tensor.
 
     Examples:
         .. code-block:: python
+
             import paddle
 
-            paddle.enable_imperative()
+            paddle.disable_static()
             
             x = paddle.rand([5, 1, 10])
             output = paddle.squeeze(x, axis=1)
@@ -504,133 +607,212 @@ def squeeze(x, axis=None, name=None):
     return layers.squeeze(x, axis, name)
 
 
-def unsqueeze(input, axes, out=None, name=None):
+def unique(x,
+           return_index=False,
+           return_inverse=False,
+           return_counts=False,
+           axis=None,
+           name=None):
     """
-	:alias_main: paddle.unsqueeze
-	:alias: paddle.unsqueeze,paddle.tensor.unsqueeze,paddle.tensor.manipulation.unsqueeze
+    Returns the unique elements of `x` in ascending order.
 
-    Insert single-dimensional entries to the shape of a Tensor. Takes one
-    required argument axes, a list of dimensions that will be inserted.
-    Dimension indices in axes are as seen in the output tensor.
+    Args:
+        x(Tensor): The input tensor, it's data type should be float32, float64, int32, int64.
+        return_index(bool, optional): If True, also return the indices of the input tensor that
+            result in the unique Tensor.
+        return_inverse(bool, optional): If True, also return the indices for where elements in
+            the original input ended up in the returned unique tensor.
+        return_counts(bool, optional): If True, also return the counts for each unique element.
+        axis(int, optional): The axis to apply unique. If None, the input will be flattened.
+            Default: None.
+        name(str, optional): Name for the operation. For more information, please refer to
+            :ref:`api_guide_Name`. Default: None.
+
+    Returns: 
+        tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \
+            provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \
+            is True. `counts` is provided only if `return_counts` is True.
 
-    For example:
+    Examples:
+        .. code-block:: python
 
-    .. code-block:: text
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            x_data = np.array([2, 3, 3, 1, 5, 3])
+            x = paddle.to_tensor(x_data)
+            unique = paddle.unique(x)
+            np_unique = unique.numpy() # [1 2 3 5]
+            _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True)
+            np_indices = indices.numpy() # [3 0 1 4]
+            np_inverse = inverse.numpy() # [1 2 2 0 3 2]
+            np_counts = counts.numpy() # [1 1 3 1]
+
+            x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]])
+            unique = paddle.unique(x)
+            np_unique = unique.numpy() # [0 1 2 3]
+
+            unique = paddle.unique(x, axis=0)
+            np_unique = unique.numpy() 
+            # [[2 1 3]
+            #  [3 0 1]]
+    """
+    if axis is None:
+        axis = []
+    else:
+        axis = [axis]
+
+    if in_dygraph_mode():
+        out, inverse, indices, counts = core.ops.unique(
+            x, 'dtype',
+            convert_np_dtype_to_dtype_('int32'), 'return_index', return_index,
+            'return_inverse', return_inverse, 'return_counts', return_counts,
+            'axis', axis, "is_sorted", True)
+        outs = [out]
+        if return_index:
+            outs.append(indices)
+        if return_inverse:
+            outs.append(inverse)
+        if return_counts:
+            outs.append(counts)
+
+        if len(outs) == 1:
+            return outs[0]
+
+        return tuple(outs)
+
+    check_variable_and_dtype(x, "input",
+                             ['float32', 'float64', 'int32', 'int64'], 'unique')
+    check_type(return_index, 'return_index', bool, 'unique')
+    check_type(return_inverse, 'return_inverse', bool, 'unique')
+    check_type(return_counts, 'return_counts', bool, 'unique')
+    if len(axis) != 0:
+        check_type(axis[0], 'axis', int, 'unique')
+
+    helper = LayerHelper('unique', **locals())
+    attrs = {
+        'dtype': int(core.VarDesc.VarType.INT32),
+        "return_index": return_index,
+        "return_inverse": return_inverse,
+        "return_counts": return_counts,
+        "axis": axis,
+        "is_sorted": True
+    }
+    out = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
+    inverse = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
+    outputs = {"Out": out, "Index": inverse}
+    outs = [out]
+    if return_index:
+        indices = helper.create_variable_for_type_inference(
+            dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
+        outputs["Indices"] = indices
+        outs.append(indices)
+    if return_inverse:
+        outs.append(inverse)
+    if return_counts:
+        counts = helper.create_variable_for_type_inference(
+            dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
+        outputs["Counts"] = counts
+        outs.append(counts)
+
+    helper.append_op(
+        type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs)
+
+    if len(outs) == 1:
+        return outs[0]
+
+    return tuple(outs)
+
+
+def unsqueeze(x, axis, name=None):
+    """
+	:alias_main: paddle.unsqueeze
+	:alias: paddle.unsqueeze, paddle.tensor.unsqueeze, paddle.tensor.manipulation.unsqueeze
 
-      Given a tensor such that tensor with shape [3, 4, 5],
-      then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
+    Insert single-dimensional entries to the shape of input Tensor ``x``. Takes one
+    required argument axis, a dimension or list of dimensions that will be inserted.
+    Dimension indices in axis are as seen in the output tensor.
 
     Args:
-        input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32.
-        axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
-        name (str|None): Name for this layer.
+        x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
+        axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . 
+                                    If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. 
+                                    If ``axis`` is a Tensor, it should be an 1-D Tensor .
+                                    If ``axis`` is negative, ``axis = axis + ndim(x) + 1``.
+        name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None.
 
     Returns:
-        Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64.
+        Tensor: Unsqueezed Tensor with the same data type as input Tensor.
 
     Examples:
         .. code-block:: python
-            import numpy as np
+
             import paddle
-            import paddle.fluid as fluid
 
-            with fluid.dygraph.guard():
-                input_1 = np.random.random([5, 10]).astype("int32")
-                # input is a variable which shape is [5, 10]
-                input = fluid.dygraph.to_variable(input_1)
+            paddle.disable_static()
+            x = paddle.rand([5, 10])
+            print(x.shape)  # [5, 10]
+            
+            out1 = paddle.unsqueeze(x, axis=0)
+            print(out1.shape)  # [1, 5, 10]
+            
+            out2 = paddle.unsqueeze(x, axis=[0, 2]) 
+            print(out2.shape)  # [1, 5, 1, 10]
 
-                output = paddle.unsqueeze(input, axes=[1])
-                # output.shape [5, 1, 10]
+            axis = paddle.fluid.dygraph.to_variable([0, 1, 2])
+            out3 = paddle.unsqueeze(x, axis=axis) 
+            print(out3.shape)  # [1, 1, 1, 5, 10]
+            
     """
-    if not isinstance(axes, (int, list, tuple, Variable)):
-        raise TypeError(
-            "The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but "
-            "received %s." % (type(axes)))
-    helper = LayerHelper("unsqueeze2", **locals())
-    inputs = {"X": input}
-    attrs = {}
-
-    def _to_Variable_list(one_list):
-        Variable_list = []
-        for ele in one_list:
-            if isinstance(ele, Variable):
-                ele.stop_gradient = True
-                Variable_list.append(ele)
-            else:
-                assert (isinstance(ele, int))
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', ele, force_cpu=True, out=temp_out)
-                Variable_list.append(temp_out)
-        return Variable_list
-
-    if isinstance(axes, int):
-        axes = [axes]
-    if isinstance(axes, Variable):
-        axes.stop_gradient = True
-        inputs["AxesTensor"] = axes
-    elif isinstance(axes, (list, tuple)):
-        contain_var = not all(not isinstance(ele, Variable) for ele in axes)
-        if contain_var:
-            inputs["AxesTensorList"] = _to_Variable_list(axes)
-        else:
-            attrs["axes"] = axes
-
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="unsqueeze2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    if isinstance(axis, int):
+        axis = [axis]
 
-    return out
+    return layers.unsqueeze(x, axis, name)
 
 
-def gather(input, index, overwrite=True):
+def gather(x, index, axis=None, name=None):
     """
-	:alias_main: paddle.gather
-	:alias: paddle.gather,paddle.tensor.gather,paddle.tensor.manipulation.gather
 
     **Gather Layer**
 
-    Output is obtained by gathering entries of the outer-most dimension
-    of X indexed by `index` and concatenate them together.
-
-    .. math::
-
-        Out = X[Index]
-
+    Output is obtained by gathering entries of ``axis``
+    of ``x`` indexed by ``index`` and concatenate them together.
 
     .. code-block:: text
 
 
                 Given:
 
-                X = [[1, 2],
+                x = [[1, 2],
                      [3, 4],
                      [5, 6]]
 
-                Index = [1, 2]
+                index = [1, 2]
+                axis=[0]
 
                 Then:
 
-                Out = [[3, 4],
+                out = [[3, 4],
                        [5, 6]]
     Args:
-        input (Variable): The source input tensor with rank>=1. Supported data type is
+        x (Tensor): The source input tensor with rank>=1. Supported data type is
             int32, int64, float32, float64 and uint8 (only for CPU),
             float16 (only for GPU).
-        index (Variable): The index input tensor with rank=1. Data type is int32 or int64.
-        overwrite (bool, optional): The mode that updating the grad when has same index.
-            If True, use the overwrite mode to update the grad of the same index,
-            if False, use the accumulate mode to update the grad of the same index.
-            Default value is True.
-
-
+        index (Tensor): The index input tensor with rank=1. Data type is int32 or int64.
+        axis (Tensor|int, optional): The axis of input to be gathered, it's can be int or a Tensor with data type is int32 or int64. The default value is None, if None, the ``axis`` is 0.
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        output (Variable): The output is a tensor with the same rank as input.
+        output (Tensor): The output is a tensor with the same rank as ``x``.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64.
+        TypeError: ``axis`` must be a Tensor or int and the data type of ``index`` must be int32 or int64 when it's a Tensor.
 
     Examples:
 
@@ -638,26 +820,41 @@ def gather(input, index, overwrite=True):
 
             import numpy as np
             import paddle
-            import paddle.fluid as fluid
-
 
-            with fluid.dygraph.guard():
-                input_1 = np.array([[1,2],[3,4],[5,6]])
-                index_1 = np.array([0,1])
-                input = fluid.dygraph.to_variable(input_1)
-                index = fluid.dygraph.to_variable(index_1)
-                output = paddle.gather(input, index)
-                # expected output: [[1,2],[3,4]]
+            paddle.disable_static()
+            input_1 = np.array([[1,2],[3,4],[5,6]])
+            index_1 = np.array([0,1])
+            input = paddle.to_tensor(input_1)
+            index = paddle.to_tensor(index_1)
+            output = paddle.gather(input, index, axis=0)
+            # expected output: [[1,2],[3,4]]
     """
+    if axis is None:
+        axis = 0
+    axis_tensor = axis
+    if not isinstance(axis, Variable):
+        axis_tensor = fill_constant(shape=[1], dtype='int64', value=axis)
+    if in_dygraph_mode():
+        return core.ops.gather(x, index, axis_tensor)
+
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        'gather')
+    check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
+    if isinstance(axis, Variable):
+        check_variable_and_dtype(axis, 'axis', ['int32', 'int64'], 'gather')
+    else:
+        check_type(axis, 'axis', (int), 'gather')
+
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="gather",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": out},
-        attrs={'overwrite': overwrite})
+        inputs={"X": x,
+                "Index": index,
+                "Axis": axis_tensor},
+        outputs={"Out": out})
     return out
 
 
@@ -716,3 +913,523 @@ def unbind(input, axis=0):
         outputs={"Out": outs},
         attrs={"axis": axis})
     return outs
+
+
+def scatter(x, index, updates, overwrite=True, name=None):
+    """
+    **Scatter Layer**
+    Output is obtained by updating the input on selected indices based on updates.
+    
+    .. code-block:: python
+        import numpy as np
+        #input:
+        x = np.array([[1, 1], [2, 2], [3, 3]])
+        index = np.array([2, 1, 0, 1])
+        # shape of updates should be the same as x
+        # shape of updates with dim > 1 should be the same as input
+        updates = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
+        overwrite = False
+        # calculation:
+        if not overwrite:
+            for i in range(len(index)):
+                x[index[i]] = np.zeros((2))
+        for i in range(len(index)):
+            if (overwrite):
+                x[index[i]] = updates[i]
+            else:
+                x[index[i]] += updates[i]
+        # output:
+        out = np.array([[3, 3], [6, 6], [1, 1]])
+        out.shape # [3, 2]
+
+    **NOTICE**: The order in which updates are applied is nondeterministic, 
+    so the output will be nondeterministic if index contains duplicates.
+
+    Args:
+        x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64.
+        index (Tensor): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length.
+        updates (Tensor): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input.
+        overwrite (bool): The mode that updating the output when there are same indices. 
+          If True, use the overwrite mode to update the output of the same index,
+	      if False, use the accumulate mode to update the output of the same index.Default value is True.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
+ 
+    Returns:
+        Tensor: The output is a Tensor with the same shape as x.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+
+            x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32)
+            index_data = np.array([2, 1, 0, 1]).astype(np.int64)
+            updates_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32)
+            
+            x = paddle.to_tensor(x_data)
+            index = paddle.to_tensor(index_data)
+            updates = paddle.to_tensor(updates_data)
+  
+            output1 = paddle.scatter(x, index, updates, overwrite=False)
+            # [[3., 3.],
+            #  [6., 6.],
+            #  [1., 1.]]
+
+            output2 = paddle.scatter(x, index, updates, overwrite=True)
+            # CPU device:
+            # [[3., 3.],
+            #  [4., 4.],
+            #  [1., 1.]]
+            # GPU device maybe have two results because of the repeated numbers in index
+            # result 1:
+            # [[3., 3.],
+            #  [4., 4.],
+            #  [1., 1.]]
+            # result 2:
+            # [[3., 3.],
+            #  [2., 2.],
+            #  [1., 1.]]
+    """
+    if in_dygraph_mode():
+        return core.ops.scatter(x, index, updates, 'overwrite', overwrite)
+
+    check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'scatter')
+    check_type(overwrite, 'overwrite', bool, 'scatter')
+    helper = LayerHelper('scatter', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type="scatter",
+        inputs={"X": x,
+                "Ids": index,
+                "Updates": updates},
+        attrs={'overwrite': overwrite},
+        outputs={"Out": out})
+    return out
+
+
+def chunk(x, chunks, axis=0, name=None):
+    """
+    Split the input tensor into multiple sub-Tensors.
+    
+    Args:
+        x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64.
+        chunks(int): The number of tensor to be split along the certain axis.
+        axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type 
+            ``int`` or a ``Tensor`` with shape [1] and data type  ``int32`` or ``int64``.
+            If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0.
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+    Returns:
+        list(Tensor): The list of segmented Tensors.
+    Raises:
+        TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64.
+        TypeError: ``chunks`` is not int.
+        TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor.
+    Example:
+        .. code-block:: python
+            
+            import numpy as np
+            import paddle
+            
+            paddle.disable_static()
+            # x is a Tensor which shape is [3, 9, 5]
+            x_np = np.random.random([3, 9, 5]).astype("int32")
+            x = paddle.to_tensor(x_np)
+
+            out0, out1, out22 = paddle.chunk(x, chunks=3, axis=1)
+            # out0.shape [3, 3, 5]
+            # out1.shape [3, 3, 5]
+            # out2.shape [3, 3, 5]
+
+            
+            # axis is negative, the real axis is (rank(x) + axis) which real
+            # value is 1.
+            out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2)
+            # out0.shape [3, 3, 5]
+            # out1.shape [3, 3, 5]
+            # out2.shape [3, 3, 5]
+    """
+    check_type(chunks, 'chunks', (int), 'chunk')
+    return paddle.fluid.layers.split(
+        input=x, num_or_sections=chunks, dim=axis, name=name)
+
+
+def tile(x, repeat_times, name=None):
+    """
+
+    Construct a new Tensor by repeating ``x`` the number of times given by ``repeat_times``.
+    After tiling, the value of the i'th dimension of the output is equal to ``x.shape[i]*repeat_times[i]``.
+
+    Both the number of dimensions of ``x`` and the number of elements in ``repeat_times`` should be less than or equal to 6.
+
+    Args:
+        x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64.
+        repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements
+            should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        N-D Tensor. The data type is the same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            np_data = np.array([1, 2, 3]).astype('int32')
+            data = paddle.to_tensor(np_data)
+            out = paddle.tile(data, repeat_times=[2, 1])
+            np_out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+
+            out = paddle.tile(data, repeat_times=[2, 2])
+            np_out = out.numpy()
+            # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]]
+
+            np_repeat_times = np.array([2, 1]).astype("int32")
+            repeat_times = paddle.to_tensor(np_repeat_times)
+            out = paddle.tile(data, repeat_times=repeat_times)
+            np_out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+    """
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile')
+    check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
+    if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
+        raise ValueError(
+            "When the date type is bool for the input 'x' of tile op, you "
+            "must set its stop_gradient to be True by "
+            "some_var.stop_gradient == True supporting some_var is the input.")
+
+    if in_dygraph_mode():
+        return core.ops.tile(x, 'repeat_times', repeat_times)
+
+    helper = LayerHelper('tile', **locals())
+
+    inputs = {"X": [x]}
+    attrs = {}
+
+    def get_attr_repeat_times(list_repeat_times):
+        attrs_repeat_times = []
+        for idx, times in enumerate(list_repeat_times):
+            if isinstance(times, Variable):
+                attrs_repeat_times.append(-1)
+            else:
+                attrs_repeat_times.append(times)
+                assert times > 0, (
+                    "All elements in repeat_times must be positive for tile.")
+        return attrs_repeat_times
+
+    if isinstance(repeat_times, Variable):
+        repeat_times.stop_gradient = True
+        inputs['RepeatTimes'] = repeat_times
+        attrs['repeat_times'] = [-1]
+    elif isinstance(repeat_times, (list, tuple)):
+        attrs['repeat_times'] = get_attr_repeat_times(repeat_times)
+        if utils._contain_var(repeat_times):
+            inputs['repeat_times_tensor'] = utils._convert_to_tensor_list(
+                repeat_times)
+
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='tile', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    return out
+
+
+def expand_as(x, y, name=None):
+    """
+
+    Expand the input tensor ``x`` to the same shape as the input tensor ``y``.
+
+    Both the number of dimensions of ``x`` and ``y`` must be less than or equal to 6, and the number of dimensions of ``y`` must be greather than or equal to that of ``x``. The dimension to expand must have a value of 1.
+
+    Args:
+        x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64.
+        y (Tensor): The input tensor that gives the shape to expand to.
+        name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        N-D Tensor: A Tensor with the same shape as ``y``. The data type is the same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            np_data_x = np.array([1, 2, 3]).astype('int32')
+            np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype('int32')
+            data_x = paddle.to_tensor(np_data_x)
+            data_y = paddle.to_tensor(np_data_y)
+            out = paddle.expand_as(data_x, data_y)
+            np_out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+    """
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as')
+    check_type(y, 'y', Variable, 'expand_as')
+
+    if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
+        raise ValueError(
+            "When the data type of input 'x' for expand_as is bool, "
+            "you must set its stop_gradient to be False by "
+            "some_var.stop_gradient = True, supporting "
+            "some_var as the input 'x'.")
+    inputs = {"X": [x], "target_tensor": [y]}
+
+    if in_dygraph_mode():
+        return core.ops.expand_as_v2(x, y)
+
+    helper = LayerHelper('expand_as', **locals())
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(type='expand_as_v2', inputs=inputs, outputs={'Out': out})
+    return out
+
+
+def expand(x, shape, name=None):
+    """
+
+    Expand the input tensor to a given shape.
+
+    Both the number of dimensions of ``x`` and the number of elements in ``shape`` should be less than or equal to 6. The dimension to expand must have a value 1.
+
+
+    Args:
+        x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64.
+        shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements
+            should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. 
+            The value -1 in shape means keeping the corresponding dimension unchanged.
+        name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        N-D Tensor: A Tensor with the given shape. The data type is the same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            np_data = np.array([1, 2, 3]).astype('int32')
+            data = paddle.to_tensor(np_data)
+            out = paddle.expand(data, shape=[2, 3])
+            out = out.numpy()
+            # [[1, 2, 3], [1, 2, 3]]
+    """
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand')
+    check_type(shape, 'shape', (list, tuple, Variable), 'expand')
+
+    inputs = {"X": [x]}
+    attrs = {}
+    if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
+        raise ValueError("When the data type of input 'x' for expand is bool, "
+                         "you must set its stop_gradient to be False by "
+                         "some_var.stop_gradient = True, supporting "
+                         "some_var as the input.")
+
+    if in_dygraph_mode():
+        return core.ops.expand_v2(x, 'shape', shape)
+
+    helper = LayerHelper('expand', **locals())
+
+    def get_attr_expand_shape(list_expand_shape):
+        attrs_expand_shape = []
+        for idx, shape in enumerate(list_expand_shape):
+            if isinstance(shape, Variable):
+                attrs_expand_shape.append(-1)
+            else:
+                attrs_expand_shape.append(shape)
+                assert shape > 0 or shape == -1, (
+                    "All elements in shape of expand must be positive or -1.")
+        return attrs_expand_shape
+
+    if isinstance(shape, Variable):
+        shape.stop_gradient = True
+        inputs['Shape'] = shape
+    elif isinstance(shape, (list, tuple)):
+        attrs['shape'] = get_attr_expand_shape(shape)
+        if utils._contain_var(shape):
+            inputs['expand_shapes_tensor'] = utils._convert_to_tensor_list(
+                shape)
+
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    return out
+
+
+broadcast_to = expand
+
+
+def reshape(x, shape, name=None):
+    """
+    :alias_main: paddle.reshape
+	:alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape
+
+    This operator changes the shape of ``x`` without changing its data.
+
+    Some tricks exist when specifying the target shape.
+
+    1. -1 means the value of this dimension is inferred from the total element
+    number of x and remaining dimensions. Thus one and only one dimension can
+    be set -1.
+
+    2. 0 means the actual dimension value is going to be copied from the
+    corresponding dimension of x. The index of 0s in shape can not exceed
+    the dimension of x.
+
+    Here are some examples to explain it.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
+    shape [6, 8] and leaving x's data unchanged.
+
+    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified is [2, 3, -1, 2], the reshape operator will transform x into a
+    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
+    case, one dimension of the target shape is set to -1, the value of this
+    dimension is inferred from the total element number of x and remaining
+    dimensions.
+
+    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
+    with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
+    besides -1, 0 means the actual dimension value is going to be copied from
+    the corresponding dimension of x.
+
+    Args:
+        x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
+        shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1.
+                        The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
+                        If ``shape`` is an Tensor, it should be an 1-D Tensor .
+        name(str, optional): The default value is None. Normally there is no need for user to set this property.
+                            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor: A reshaped Tensor with the same data type as ``x``.
+
+    Raises:
+        ValueError: If more than one elements of ``shape`` is -1.
+        ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``.
+        ValueError: If the elements in ``shape`` is negative except -1.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            data = np.random.random([2, 4, 6]).astype("float32")
+            x = paddle.to_tensor(data)
+
+            positive_four = paddle.fill_constant([1], "int32", 4)
+
+            out_1 = paddle.reshape(x, [-1, 0, 3, 2])
+            # the shape of out_1 is [2,4,3,2].
+
+            out_2 = paddle.reshape(x, shape=[positive_four, 12])
+            # the shape of out_2 is [4, 12].
+
+            shape_tensor = paddle.to_tensor(np.array([8, 6]).astype("int32"))
+            out_3 = paddle.reshape(x, shape=shape_tensor)
+            # the shape of out_2 is [8, 6].
+    """
+    return paddle.fluid.layers.reshape(x=x, shape=shape, name=name)
+
+
+def gather_nd(x, index, name=None):
+    """
+
+    This function is actually a high-dimensional extension of :code:`gather`
+    and supports for simultaneous indexing by multiple axes. :attr:`index` is a
+    K-dimensional integer tensor, which is regarded as a (K-1)-dimensional
+    tensor of :attr:`index` into :attr:`input`, where each element defines
+    a slice of params:
+
+    .. math::
+
+        output[(i_0, ..., i_{K-2})] = input[index[(i_0, ..., i_{K-2})]]
+
+    Obviously, :code:`index.shape[-1] <= input.rank` . And, the output tensor has
+    shape :code:`index.shape[:-1] + input.shape[index.shape[-1]:]` .
+
+    .. code-block:: text
+
+            Given:
+                x =  [[[ 0,  1,  2,  3],
+                       [ 4,  5,  6,  7],
+                       [ 8,  9, 10, 11]],
+                      [[12, 13, 14, 15],
+                       [16, 17, 18, 19],
+                       [20, 21, 22, 23]]]
+                x.shape = (2, 3, 4)
+
+            * Case 1:
+                index = [[1]]
+
+                gather_nd(x, index)
+                         = [x[1, :, :]]
+                         = [[12, 13, 14, 15],
+                            [16, 17, 18, 19],
+                            [20, 21, 22, 23]]
+
+            * Case 2:
+                index = [[0,2]]
+
+                gather_nd(x, index)
+                         = [x[0, 2, :]]
+                         = [8, 9, 10, 11]
+
+            * Case 3:
+                index = [[1, 2, 3]]
+
+                gather_nd(x, index)
+                         = [x[1, 2, 3]]
+                         = [23]
+
+    Args:
+        x (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64.
+        index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank.
+                        Its dtype should be int32, int64.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
+                        For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64.
+        TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64.
+
+    Examples:
+
+        .. code-block:: python
+            
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+            np_x = np.array([[[1, 2], [3, 4], [5, 6]],
+                             [[7, 8], [9, 10], [11, 12]]])
+            np_index = [[0, 1]]
+            x = paddle.to_tensor(np_x)
+            index = paddle.to_tensor(np_index)
+            
+            output = paddle.gather_nd(x, index) #[[3, 4]]
+
+    """
+
+    return paddle.fluid.layers.gather_nd(input=x, index=index, name=name)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
old mode 100644
new mode 100755
index 878fdbfc1f5761317fb5f8a32bbee5f5ef7f5bc0..9dfb31a5ac25b2afc9fe52bfc8bab5ad277d80b8
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -21,7 +21,7 @@ from ..fluid import layers
 from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
-from ..fluid.layers.layer_function_generator import _generate_doc_string_
+from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn
 import sys
 
 # TODO: define math functions
@@ -33,12 +33,9 @@ from ..fluid.layers import ceil    #DEFINE_ALIAS
 from ..fluid.layers import cos    #DEFINE_ALIAS
 from ..fluid.layers import sinh    #DEFINE_ALIAS
 from ..fluid.layers import cosh    #DEFINE_ALIAS
-from ..fluid.layers import cumsum    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_add    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_div    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_floordiv    #DEFINE_ALIAS
-from ..fluid.layers import elementwise_max    #DEFINE_ALIAS
-from ..fluid.layers import elementwise_min    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_mod    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_mul    #DEFINE_ALIAS
 from ..fluid.layers import elementwise_pow    #DEFINE_ALIAS
@@ -54,15 +51,18 @@ from ..fluid.layers import reduce_sum    #DEFINE_ALIAS
 from ..fluid.layers import round    #DEFINE_ALIAS
 from ..fluid.layers import rsqrt    #DEFINE_ALIAS
 from ..fluid.layers import scale    #DEFINE_ALIAS
-from ..fluid.layers import sign    #DEFINE_ALIAS
 from ..fluid.layers import square    #DEFINE_ALIAS
 from ..fluid.layers import stanh    #DEFINE_ALIAS
 from ..fluid.layers import atan    #DEFINE_ALIAS
 from ..fluid.layers import erf    #DEFINE_ALIAS
+from ..fluid.layers import sqrt    #DEFINE_ALIAS
+from ..fluid.layers import sin    #DEFINE_ALIAS
 
 from ..fluid.layers import increment    #DEFINE_ALIAS
 from ..fluid.layers import multiplex    #DEFINE_ALIAS
 from ..fluid.layers import sums    #DEFINE_ALIAS
+from ..fluid import layers
+import paddle
 
 __all__ = [
         'abs',
@@ -76,8 +76,6 @@ __all__ = [
         'elementwise_add',
         'elementwise_div',
         'elementwise_floordiv',
-        'elementwise_max',
-        'elementwise_min',
         'elementwise_mod',
         'elementwise_pow',
         'elementwise_sub',
@@ -85,8 +83,10 @@ __all__ = [
         'floor',
         'increment',
         'log',
+        'logsumexp',
         'mul',
         'multiplex',
+        'prod',
         'pow',
         'reciprocal',
         'reduce_max',
@@ -107,9 +107,15 @@ __all__ = [
         'tanh',
         'elementwise_sum',
         'max',
+        'maximum',
         'min',
+        'minimum',
         'mm',
-        'div',
+        'divide',
+        'floor_divide',
+        'remainder',
+        'mod',
+        'floor_mod',
         'multiply',
         'add',
         'atan',
@@ -119,71 +125,27 @@ __all__ = [
         'erf',
         'addcmul',
         'addmm',
-        'clamp',
+        'clip',
         'trace',
-        'kron'
+        'kron',
+        'isfinite',
+        'isinf',
+        'isnan'
 ]
-
-
 # yapf: enable.
 
+_supported_int_dtype_ = [
+    VarDesc.VarType.UINT8,
+    VarDesc.VarType.INT8,
+    VarDesc.VarType.INT16,
+    VarDesc.VarType.INT32,
+    VarDesc.VarType.INT64,
+]
 
-def generate_op_noattr(op_type):
-    """Register the Python layer for an Operator without Attribute..
-
-    Args:
-       op_type: The name of the operator to be created.
-
-    This function takes in the operator type (sin, tanh etc) and
-    creates the operator functionality.
-
-    """
-    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
-
-    def func(x, name=None):
-        if in_dygraph_mode():
-            op = getattr(core.ops, op_type)
-            return op(x)
-
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 op_type)
-        helper = LayerHelper(op_type, **locals())
-
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": out})
-        return out
-
-    func.__name__ = op_type
-    func.__doc__ = _generate_doc_string_(
-        op_proto,
-        additional_args_lines=[
-            "name(str, optional): The default value is None.  Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name`.\n    "
-            "out(Variable, optional): The default value is None. Optional output can be any created Variable that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result."
-        ])
-    func.__doc__ = func.__doc__ + """
-
-Return type
-  Variable
-Examples:
-    .. code-block:: python
-
-        import numpy as np
-
-        import paddle
-        import paddle.fluid as fluid
-
-        inputs = fluid.data(name="x", shape = [None, 4], dtype='float32')
-        output = paddle.%s(inputs)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-
-        #input.shape=1X4, batch_size=1
-        img = np.array([[1.0, 2.0, 3.0, 4.0]]).astype(np.float32)
-        res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
-        print(res)
-""" % op_type
-    return func
+_supported_float_dtype_ = [
+    VarDesc.VarType.FP32,
+    VarDesc.VarType.FP64,
+]
 
 @templatedoc()
 def pow(input, exponent, name=None):
@@ -245,17 +207,6 @@ def pow(input, exponent, name=None):
     return out
 
 
-__ops__noattr__ = [
-    'atan',
-    'sin',
-    'sqrt',
-    'tanh',
-]
-
-for _OP in set(__ops__noattr__):
-    globals()[_OP] = generate_op_noattr(_OP)
-
-
 @dygraph_only
 def _elementwise_op_in_dygraph(x,
                                y,
@@ -304,251 +255,367 @@ def _elementwise_op(helper):
     return helper.append_activation(out)
 
 
-def add(x, y, alpha=1, name=None):
+def add(x, y, name=None):
     """
 Examples:
 
-    .. code-block:: python
+    ..  code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import numpy as np
 
-        def gen_data():
-            return {
-                "x": np.array([2, 3, 4]).astype('float32'),
-                "y": np.array([1, 5, 2]).astype('float32')
-            }
+        paddle.disable_static()
+        np_x = np.array([2, 3, 4]).astype('float64')
+        np_y = np.array([1, 5, 2]).astype('float64')
+        x = paddle.to_variable(np_x)
+        y = paddle.to_variable(np_y)
+        z = paddle.add(x, y)
+        np_z = z.numpy()
+        print(np_z)  # [3., 8., 6. ]
 
-        x = fluid.data(name="x", shape=[3], dtype='float32')
-        y = fluid.data(name="y", shape=[3], dtype='float32')
-        z1 = paddle.add(x, y)
-        z2 = paddle.add(x, y, alpha=10)
-        # z = x + y
+    """
+    op_type = 'elementwise_add'
+    axis = -1
+    if in_dygraph_mode():
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, op_name=op_type)
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z1.name, z2.name])
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
-        print(z_value[0]) # [3., 8., 6.]
-        print(z_value[1]) # [12. 53. 24.]
 
+def divide(x, y, name=None):
+    """
+    Divide two tensors element-wise. The equation is:
 
-    .. code-block:: python
+    .. math::
+        out = x / y
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    **Note**:
+    ``paddle.divide`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-        def gen_data():
-            return {
-                "x": np.ones((2, 3, 4, 5)).astype('float32'),
-                "y": np.zeros((4, 5)).astype('float32')
-            }
+    Args:
+        x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64.
+        y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        x = fluid.data(name="x", shape=[2, 3, 4, 5], dtype='float32')
-        y = fluid.data(name="y", shape=[4, 5], dtype='float32')
-        z = paddle.add(x, y, name='z')
-        # z = x + y
+    Returns:
+        N-D Tensor. A location into which the result is stored. It's dimension equals with $x$.
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
+    Examples:
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
+        ..  code-block:: python
 
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
+            import paddle
+            import numpy as np
 
+            paddle.disable_static()
 
-    ..  code-block:: python
+            np_x = np.array([2, 3, 4]).astype('float64')
+            np_y = np.array([1, 5, 2]).astype('float64')
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.divide(x, y)
+            print(z.numpy())  # [2., 0.6, 2.]
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    """
+    op_type = 'elementwise_div'
+    axis = -1
+    act = None
+    if in_dygraph_mode():
+        # rule 1 : avoid numpy.ndarray
+        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+            raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+        # rule 2: both the inputs are not Tensor
+        elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
+            y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
+
+        # rule 3: both the inputs are Tensor
+        elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            if y.dtype != x.dtype:
+                raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
+                                "But x is {}, y is {}".format(x.dtype, y.dtype))
+            elif x.dtype in _supported_int_dtype_:
+                x = x.astype(paddle.get_default_dtype())
+                y = y.astype(paddle.get_default_dtype())
+
+        # rule 4: x is Tensor, y is scalar
+        elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            if x.dtype in _supported_int_dtype_:
+                x = x.astype(paddle.get_default_dtype())
+            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
+
+        # rule 5: x is scalar, y is Tensor
+        elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            if y.dtype in _supported_int_dtype_:
+                y = y.astype(paddle.get_default_dtype())
+            x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
 
-        def gen_data():
-            return {
-                "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
-                "y": np.random.randint(1, 5, size=[5]).astype('float32')
-            }
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, act=act, op_name=op_type)
 
-        x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
-        y = fluid.data(name="y", shape=[5], dtype='float32')
-        z = paddle.add(x, y)
-        # z = x / y
+    # rule 1 : avoid numpy.ndarray
+    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+        raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+    # rule 2: both the inputs are not Tensor
+    elif not isinstance(x, Variable) and not isinstance(y, Variable):
+        x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
+        y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
+
+    # rule 3: both the inputs are Tensor
+    elif isinstance(x, Variable) and isinstance(y, Variable):
+        if y.dtype != x.dtype:
+            raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
+                            "But x is {}, y is {}".format(x.dtype, y.dtype))
+        elif x.dtype in _supported_int_dtype_:
+            x = paddle.cast(x, paddle.get_default_dtype())
+            y = paddle.cast(y, paddle.get_default_dtype())
+
+    # rule 4: x is Tensor, y is scalar
+    elif isinstance(x, Variable) and not isinstance(y, Variable):
+        if x.dtype in _supported_int_dtype_:
+            x = paddle.cast(x, paddle.get_default_dtype())
+        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
+
+    # rule 5: x is scalar, y is Tensor
+    elif not isinstance(x, Variable) and isinstance(y, Variable):
+        if y.dtype in _supported_int_dtype_:
+            y = paddle.cast(y, paddle.get_default_dtype())
+        x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
 
+def floor_divide(x, y, name=None):
+    """
+    Floor divide two tensors element-wise. The equation is:
 
-    ..  code-block:: python
+    .. math::
+        out = x // y
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    **Note**:
+    ``paddle.floor_divide`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-        x = fluid.data(name="x", shape=[3], dtype="float32")
-        y = fluid.data(name='y', shape=[3], dtype='float32')
-        z = paddle.add(x, y)
+    Args:
+        x (Tensor): the input tensor, it's data type should be int32, int64.
+        y (Tensor): the input tensor, it's data type should be int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        data1 = np.array([2, 3, 4], dtype='float32')
-        data2 = np.array([1, 5, 2], dtype='float32')
-        z_value = exe.run(feed={'x': data1,
-                                'y': data2},
-                                fetch_list=[z])
-        print(z_value[0]) # [3. 8. 6.]
+    Returns:
+        N-D Tensor. A location into which the result is stored. It's dimension equals with $x$.
 
+    Examples:
 
-    ..  code-block:: python
+        ..  code-block:: python
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+            import paddle
+            import numpy as np
 
-        with fluid.dygraph.guard():
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            y = fluid.dygraph.to_variable(np_y)
-            z = paddle.add(x, y, alpha=-0.5)
-            np_z = z.numpy()
-            print(np_z)  # [1.5, 0.5, 3. ]
+            paddle.disable_static()
+
+            np_x = np.array([2, 3, 8, 7])
+            np_y = np.array([1, 5, 3, 3])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.floor_divide(x, y)
+            print(z.numpy())  # [2, 0, 2, 2]
 
     """
-    op_type = 'elementwise_add'
+    op_type = 'elementwise_floordiv'
     axis = -1
-    act = None
-    if alpha != 1:
-        y = scale(y, scale=alpha)
     if in_dygraph_mode():
+        # rule 1 : avoid numpy.ndarray
+        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+            raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+        # rule 2: both the inputs are not Tensor
+        elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
+            y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
+
+        # rule 3: both the inputs are Tensor
+        elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            if y.dtype != x.dtype:
+                raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype."
+                                "But x is {}, y is {}".format(x.dtype, y.dtype))
+
+        # rule 4: x is Tensor, y is scalar
+        elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
+            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
+
+        # rule 5: x is scalar, y is Tensor
+        elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
+            x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
+
         return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name=op_type)
+            x, y, axis=axis, op_name=op_type)
+
+    # rule 1 : avoid numpy.ndarray
+    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+        raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
+
+    # rule 2: both the inputs are not Tensor
+    elif not isinstance(x, Variable) and not isinstance(y, Variable):
+        x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
+        y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
+
+    # rule 3: both the inputs are Tensor
+    elif isinstance(x, Variable) and isinstance(y, Variable):
+        if y.dtype != x.dtype:
+            raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
+                            "But x is {}, y is {}".format(x.dtype, y.dtype))
+
+    # rule 4: x is Tensor, y is scalar
+    elif isinstance(x, Variable) and not isinstance(y, Variable):
+        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
+
+    # rule 5: x is scalar, y is Tensor
+    elif not isinstance(x, Variable) and isinstance(y, Variable):
+        x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
 
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 
-def div(x, y, name=None):
+def remainder(x, y, name=None):
     """
-Examples:
+    Mod two tensors element-wise. The equation is:
 
-    .. code-block:: python
+    .. math::
+        out = x \% y
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    **Note**:
+    ``paddle.remainder`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
 
-        def gen_data():
-            return {
-                "x": np.array([2, 3, 4]).astype('float32'),
-                "y": np.array([1, 5, 2]).astype('float32')
-            }
+    Args:
+        x (Tensor): the input tensor, it's data type should be int32, int64.
+        y (Tensor): the input tensor, it's data type should be int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-        x = fluid.data(name="x", shape=[3], dtype='float32')
-        y = fluid.data(name="y", shape=[3], dtype='float32')
-        z = paddle.div(x, y)
-        # z = x / y
+    Returns:
+        N-D Tensor. A location into which the result is stored. It's dimension equals with $x$.
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
+    Examples:
 
-        print(z_value) # [2., 0.6, 2.]
+        ..  code-block:: python
 
+            import paddle
+            import numpy as np
 
-    .. code-block:: python
+            paddle.disable_static()
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+            np_x = np.array([2, 3, 8, 7])
+            np_y = np.array([1, 5, 3, 3])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.remainder(x, y)
+            print(z.numpy())  # [0, 3, 2, 1]
 
-        def gen_data():
-            return {
-                "x": np.ones((2, 3, 4, 5)).astype('float32'),
-                "y": np.zeros((4, 5)).astype('float32')
-            }
+    """
+    op_type = 'elementwise_mod'
+    axis = -1
+    if in_dygraph_mode():
+        # rule 1 : avoid numpy.ndarray
+        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+            raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
 
-        x = fluid.data(name="x", shape=[2, 3, 4, 5], dtype='float32')
-        y = fluid.data(name="y", shape=[4, 5], dtype='float32')
-        z = paddle.div(x, y, name='z')
-        # z = x / y
+        elif not isinstance(x, paddle.Tensor):
+            raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
+        # rule 3: both the inputs are Tensor
+        elif isinstance(y, paddle.Tensor):
+            if y.dtype != x.dtype:
+                raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
+                                "But x is {}, y is {}".format(x.dtype, y.dtype))
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
+        # rule 4: x is Tensor, y is scalar
+        elif not isinstance(y, paddle.Tensor):
+            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
 
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, op_name=op_type)
 
+    # rule 1 : avoid numpy.ndarray
+    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
+        raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
 
-    ..  code-block:: python
+    elif not isinstance(x, Variable):
+        raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    # rule 3: both the inputs are Tensor
+    elif isinstance(y, Variable):
+        if y.dtype != x.dtype:
+            raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
+                            "But x is {}, y is {}".format(x.dtype, y.dtype))
 
-        def gen_data():
-            return {
-                "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
-                "y": np.random.randint(1, 5, size=[5]).astype('float32')
-            }
+    # rule 4: x is Tensor, y is scalar
+    elif not isinstance(y, paddle.Tensor):
+        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
 
-        x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
-        y = fluid.data(name="y", shape=[5], dtype='float32')
-        z = paddle.div(x, y)
-        # z = x / y
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
 
-        z_value = exe.run(feed=gen_data(),
-                            fetch_list=[z.name])
-        print(z_value[0])
-        print(z_value[0].shape) # z.shape=[2,3,4,5]
+mod = remainder  #DEFINE_ALIAS
+floor_mod = remainder  #DEFINE_ALIAS
 
 
-    ..  code-block:: python
+def multiply(x, y, axis=-1, name=None):
+    """
+    multiply two tensors element-wise. The equation is:
 
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
+    .. math::
+        out = x * y
 
-        with fluid.dygraph.guard(fluid.CPUPlace()):
-            np_x = np.array([2, 3, 4]).astype('float64')
-            np_y = np.array([1, 5, 2]).astype('float64')
-            x = fluid.dygraph.to_variable(np_x)
-            y = fluid.dygraph.to_variable(np_y)
-            z = paddle.div(x, y)
-            np_z = z.numpy()
-            print(np_z)  # [2., 0.6, 2.]
+    **Note**:
+    ``paddle.multiply`` supports broadcasting. If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
+
+    Args:
+        x (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
+        y (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        N-D Tensor. A location into which the result is stored. Its dimension equals with $x$.
+
+    Examples:
+
+        ..  code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
+            y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
+            res = paddle.multiply(x, y)
+            print(res.numpy()) # [[5, 12], [21, 32]]
+
+            x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
+            y_data = np.array([1, 2], dtype=np.float32)
+            x = paddle.to_tensor(x_data)
+            y = paddle.to_tensor(y_data)
+            res = paddle.multiply(x, y, axis=1)
+            print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
 
     """
-    op_type = 'elementwise_div'
-    axis = -1
+    op_type = 'elementwise_mul'
     act = None
+    if x.dtype != y.dtype:
+        raise TypeError(
+            'Input tensors must be same type, but received type of x: %s, type of y: %s '
+            % (x.dtype, y.dtype))
+
     if in_dygraph_mode():
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
 
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
-
-def multiply(x, y, axis=-1, name=None):
+def maximum(x, y, axis=-1, name=None):
     """
-	:alias_main: paddle.multiply
-	:alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply
-
 Examples:
 
     .. code-block:: python
@@ -556,131 +623,201 @@ Examples:
         import paddle
         import numpy as np
 
-        paddle.enable_imperative()
+        paddle.disable_static()
+  
         x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
         y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
-        res = paddle.multiply(x, y)
-        print(res.numpy()) # [[5, 12], [21, 32]]
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y)
+        print(res.numpy())
+        #[[5. 6.]
+        # [7. 8.]]
 
         x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
         y_data = np.array([1, 2], dtype=np.float32)
-        x = paddle.imperative.to_variable(x_data)
-        y = paddle.imperative.to_variable(y_data)
-        res = paddle.multiply(x, y, axis=1)
-        print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
-
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y, axis=1)
+        print(res.numpy())
+        #[[[1. 2. 3.]
+        #  [2. 2. 3.]]]
+
+        x_data = np.array([2, 3, 5], dtype=np.float32)
+        y_data = np.array([1, 4, np.nan], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y)
+        print(res.numpy())
+        #[ 2.  4. nan]
+
+        x_data = np.array([5, 3, np.inf], dtype=np.float32)
+        y_data = np.array([1, 4, 5], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.maximum(x, y)
+        print(res.numpy())
+        #[ 5.  4. inf]
     """
-    op_type = 'elementwise_mul'
+    op_type = 'elementwise_max'
     act = None
     if in_dygraph_mode():
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
-
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
+def minimum(x, y, axis=-1, name=None):
+    """
+Examples:
+
+    .. code-block:: python
+
+        import paddle
+        import numpy as np
+        paddle.disable_static()
+  
+        x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
+        y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y)
+        print(res.numpy())
+        #[[1. 2.]
+        # [3. 4.]]
+
+        x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
+        y_data = np.array([1, 2], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y, axis=1)
+        print(res.numpy())
+        #[[[1. 1. 1.]
+        #  [2. 2. 2.]]]
+
+        x_data = np.array([2, 3, 5], dtype=np.float32)
+        y_data = np.array([1, 4, np.nan], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y)
+        print(res.numpy())
+        #[ 1.  3. nan]
+
+        x_data = np.array([5, 3, np.inf], dtype=np.float32)
+        y_data = np.array([1, 4, 5], dtype=np.float32)
+        x = paddle.to_variable(x_data)
+        y = paddle.to_variable(y_data)
+        res = paddle.minimum(x, y)
+        print(res.numpy())
+        #[1. 3. 5.]
+    """
+    op_type = 'elementwise_min'
+    act = None
+    if in_dygraph_mode():
+        return _elementwise_op_in_dygraph(
+            x, y, axis=axis, act=act, op_name=op_type)
+    return _elementwise_op(LayerHelper(op_type, **locals()))
 
 for func in [
         add,
-        div,
-        multiply,
+        maximum,
+        minimum,
+        multiply
 ]:
-    proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div', 'multiply': 'elementwise_mul'}
+    proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div', 'maximum': 'elementwise_max', 'minimum': 'elementwise_min', 'multiply': 'elementwise_mul'}
     op_proto = OpProtoHolder.instance().get_op_proto(proto_dict[func.__name__])
-    if func.__name__ in ['add']:
-        alias_main = ':alias_main: paddle.%(func)s' % {'func': func.__name__}
-        alias = ':alias: paddle.%(func)s, paddle.tensor.%(func)s, paddle.tensor.math.%(func)s' % {'func': func.__name__}
-
-        additional_args_lines = [
-            "alpha (int|float, optional): The alpha factor of the input. Default is 1. If alpha is not 1, the equation becomes Out = X + alpha * Y.",
-            "name (string, optional): Name of the output. \
-            Default is None. It's used to print debug info for developers. Details: \
-            :ref:`api_guide_Name` "
-        ]
-    else:
-        additional_args_lines = [
-            "name (string, optional): Name of the output. \
-            Default is None. It's used to print debug info for developers. Details: \
-            :ref:`api_guide_Name` "
-        ]
 
-    func.__doc__ = alias_main + """\n""" + alias + """\n""" + _generate_doc_string_(
+    additional_args_lines = [
+        "name (string, optional): Name of the output. \
+        Default is None. It's used to print debug info for developers. Details: \
+        :ref:`api_guide_Name` "
+    ]
+
+    func.__doc__ = _generate_doc_string_(
         op_proto,
         additional_args_lines=additional_args_lines,
         skip_attrs_set={"x_data_format", "y_data_format", "axis",
-            "use_quantizer", "Scale_x", "Scale_y", "Scale_out"
+            "use_quantizer", "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out"
         }) + """\n""" + str(func.__doc__)
 
-def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
-    """
-	:alias_main: paddle.sum
-	:alias: paddle.sum,paddle.tensor.sum,paddle.tensor.math.sum
 
+def sum(x, axis=None, dtype=None, keepdim=False, name=None):
+    """
     Computes the sum of tensor elements over the given dimension.
 
     Args:
-        input (Variable): The input variable which is a Tensor, the data type is float32,
-            float64, int32, int64.
-        dim (list|int, optional): The dimensions along which the sum is performed. If
-            :attr:`None`, sum all elements of :attr:`input` and return a
+        x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64.
+        axis (int|list|tuple, optional): The dimensions along which the sum is performed. If
+            :attr:`None`, sum all elements of :attr:`x` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
-            the dimension to reduce is :math:`rank + dim[i]`.
-        dtype(str, optional): The dtype of output tensor. The default value is None, the dtype
-            of output is the same as input tensor.
-        keep_dim (bool, optional): Whether to reserve the reduced dimension in the
-            output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true, default
+            range :math:`[-rank(x), rank(x))`. If :math:`axis[i] < 0`,
+            the dimension to reduce is :math:`rank + axis[i]`.
+        dtype (str, optional): The dtype of output Tensor. The default value is None, the dtype
+            of output is the same as input Tensor `x`.
+        keepdim (bool, optional): Whether to reserve the reduced dimension in the
+            output Tensor. The result Tensor will have one fewer dimension
+            than the :attr:`x` unless :attr:`keepdim` is true, default
             value is False.
-        name(str, optional): The default value is None.  Normally there is no need for
+        name (str, optional): The default value is None. Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: Tensor, results of summation operation on the specified dim of input tensor,
-        it's data type is the same as input's Tensor.
+        Tensor: Results of summation operation on the specified axis of input Tensor `x`,
+        it's data type is the same as `x`.
 
     Raises:
-        ValueError, the :attr:`dtype` must be float64 or int64.
+        ValueError: The :attr:`dtype` must be float64 or int64.
+        TypeError: The type of :attr:`axis` must be int, list or tuple.
 
     Examples:
         .. code-block:: python
 
+            import numpy as np
             import paddle
-            import paddle.fluid as fluid
+            paddle.disable_static()
+
             # x is a Tensor variable with following elements:
             #    [[0.2, 0.3, 0.5, 0.9]
             #     [0.1, 0.2, 0.6, 0.7]]
             # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
+            x_data = np.array([[0.2, 0.3, 0.5, 0.9],[0.1, 0.2, 0.6, 0.7]]).astype('float32')
+            x = paddle.to_variable(x_data)
             out1 = paddle.sum(x)  # [3.5]
-            out2 = paddle.sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
-            out3 = paddle.sum(x, dim=-1)  # [1.9, 1.6]
-            out4 = paddle.sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+            out2 = paddle.sum(x, axis=0)  # [0.3, 0.5, 1.1, 1.6]
+            out3 = paddle.sum(x, axis=-1)  # [1.9, 1.6]
+            out4 = paddle.sum(x, axis=1, keepdim=True)  # [[1.9], [1.6]]
 
             # y is a Tensor variable with shape [2, 2, 2] and elements as below:
             #      [[[1, 2], [3, 4]],
             #      [[5, 6], [7, 8]]]
             # Each example is followed by the corresponding output tensor.
-            y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
-            out5 = paddle.sum(y, dim=[1, 2]) # [10, 26]
-            out6 = paddle.sum(y, dim=[0, 1]) # [16, 20]
-
+            y_data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype('float32')
+            y = paddle.to_variable(y_data)
+            out5 = paddle.sum(y, axis=[1, 2]) # [10, 26]
+            out6 = paddle.sum(y, axis=[0, 1]) # [16, 20]
     """
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
+    if axis is not None and not isinstance(axis, (list, tuple)):
+        axis = [axis]
+
+    if not axis:
+        reduce_all_flag = True
+    else:
+        if len(axis) == len(x.shape):
+            reduce_all_flag = True
+        else:
+            reduce_all_flag = False
+
     attrs = {
-        'dim': dim if dim != None and dim != [] else [0],
-        'keep_dim': keep_dim,
-        'reduce_all': True if dim == None or dim == [] else False,
+        'dim': axis if axis != None and axis != [] and axis != () else [0],
+        'keep_dim': keepdim,
+        'reduce_all': reduce_all_flag
     }
     dtype_flag = False
     if dtype is not None:
         if dtype in ['float64', 'int64']:
-            if (convert_dtype(input.dtype) == "float32" and dtype == "float64") or \
-               (convert_dtype(input.dtype) == "int32" and dtype == "int64"):
+            if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \
+               (convert_dtype(x.dtype) == "int32" and dtype == "int64"):
                 attrs.update({
-                    'in_dtype': input.dtype,
+                    'in_dtype': x.dtype,
                     'out_dtype': convert_np_dtype_to_dtype_(dtype)
                 })
                 dtype_flag = True
@@ -690,27 +827,28 @@ def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
                 format(dtype))
 
     if in_dygraph_mode():
-        reduce_all = True if dim == None or dim == [] else False
-        dim = dim if dim != None and dim != [] else [0]
+        axis = axis if axis != None and axis != [] else [0]
         if dtype_flag:
-            return core.ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim,
-                                       'reduce_all', reduce_all, 'in_dtype',
-                                       input.dtype, 'out_dtype',
+            return core.ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim,
+                                       'reduce_all', reduce_all_flag, 'in_dtype',
+                                       x.dtype, 'out_dtype',
                                        convert_np_dtype_to_dtype_(dtype))
         else:
-            return core.ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim,
-                                       'reduce_all', reduce_all)
+            return core.ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim,
+                                       'reduce_all', reduce_all_flag)
     check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_sum')
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum')
+    check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum')
+
     helper = LayerHelper('sum', **locals())
     if dtype_flag:
         out = helper.create_variable_for_type_inference(
             dtype=convert_np_dtype_to_dtype_(dtype))
     else:
-        out = helper.create_variable_for_type_inference(dtype=input.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reduce_sum',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
         attrs=attrs)
     return out
@@ -915,7 +1053,7 @@ def mm(input, mat2, name=None):
     return out
 
 
-def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
+def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
     """
 	:alias_main: paddle.addmm
 	:alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
@@ -935,8 +1073,8 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
         input (Variable): The input Tensor/LoDTensor to be added to the final result.
         x (Variable): The first input Tensor/LoDTensor for matrix multiplication.
         y (Variable): The second input Tensor/LoDTensor for matrix multiplication.
-        alpha (float): Coefficient of $x*y$.
         beta (float): Coefficient of $input$.
+        alpha (float): Coefficient of $x*y$.
         name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
@@ -947,25 +1085,43 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
 
             import numpy as np
             import paddle
-            import paddle.fluid as fluid
-
-            input = fluid.data(name='input', shape=[2, 2], dtype='float32')
-            x = fluid.data(name='x', shape=[2, 2], dtype='float32')
-            y = fluid.data(name='y', shape=[2, 2], dtype='float32')
-            out = paddle.addmm( input=input, x=x, y=y, alpha=5.0, beta=0.5 )
 
             data_x = np.ones((2, 2)).astype(np.float32)
             data_y = np.ones((2, 2)).astype(np.float32)
             data_input = np.ones((2, 2)).astype(np.float32)
 
-            place =  fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            results = exe.run(fluid.default_main_program(),
-                              fetch_list=[out], feed={"input": data_input, 'x': data_x, "y": data_y})
-            print( np.array(results[0]) )
+            paddle.disable_static()
+
+            x = paddle.to_variable(data_x)
+            y = paddle.to_variable(data_y)
+            input = paddle.to_variable(data_input)
+
+            out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
+
+            print( out.numpy() )
             # [[10.5 10.5]
             # [10.5 10.5]]
     """
+    input_shape = input.shape
+    x_shape = x.shape
+    y_shape = y.shape
+    if not len(input_shape) == len(x_shape) == len(y_shape) == 2:
+        raise ValueError("The dimention of input, x, y should be 2 but receive input's shape: {}, x's shape: {}, y's shape: {}".format(input_shape, x_shape, y_shape))
+    if input_shape[0] != x_shape[0]:
+        if input_shape[0] != 1:
+            raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
+        if input_shape[1] != y_shape[1] and input_shape[1] != 1:
+            raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
+    if input_shape[1] != y_shape[1]:
+        if input_shape[1] != 1:
+            raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
+        if input_shape[0] != x_shape[0] and input_shape[0] != 1:
+            raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
+    if x_shape[1] != y_shape[0]:
+        raise ValueError("The input Variable x's width must be equal with Variable y' height. But received x's shape = {}, y's shape = {}.".format(x_shape, y_shape))
+
+
+
     if in_dygraph_mode():
         out = core.ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
         return out
@@ -974,7 +1130,7 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
     attrs = {'Alpha': alpha, 'Beta': beta}
 
     helper = LayerHelper("addmm", **locals())
-    check_variable_and_dtype(x, 'Input', ['float32', 'float64'], 'addmm')
+    check_variable_and_dtype(input, 'Input', ['float32', 'float64'], 'addmm')
     check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
     check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -984,81 +1140,83 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
     return out
 
 
-def logsumexp(x, dim=None, keepdim=False, name=None):
+def logsumexp(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.logsumexp
-	:alias: paddle.logsumexp,paddle.tensor.logsumexp,paddle.tensor.math.logsumexp
-
-    This operator calculates the log of the sum of exponentials of the input Tensor.
+    This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` .
 
     .. math::
        logsumexp(x) = \log\sum exp(x)
 
-
-    Parameters:
-       x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64.
-       dim (list|int, optional): The dimensions along which the sum is performed. If :attr:`None`,
-         sum all elements of :attr:`input` and return a Tensor variable with a single element,
-         otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
-         the dimension to reduce is :math:`rank + dim[i]`.
-       keep_dim (bool, optional): Whether to reserve the reduced dimension in the output Tensor.
-         The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim`
-         is true, default value is False.
-       name (str, optional): The default value is None.  Normally there is no need for user to
-         set this property.  For more information, please refer to :ref:`api_guide_Name`
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform
+            logsumexp calculations. ``axis`` should be int, list(int) or
+            tuple(int). If ``axis`` is a list/tuple of dimension(s), logsumexp
+            is calculated along all element(s) of ``axis`` . ``axis`` or
+            element(s) of ``axis`` should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is
+            less than 0, it works the same way as :math:`axis + D` . If
+            ``axis`` is None, logsumexp is calculated along all elements of
+            ``x``. Default is None.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keep_dim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-       Variable: The calcuated result Tensor/LoDTensor.
+        Tensor, results of logsumexp along ``axis`` of ``x``, with the same data
+        type as ``x``.
 
     Examples:
 
     .. code-block:: python
 
         import paddle
-        import paddle.fluid as fluid
         import numpy as np
 
-        with fluid.dygraph.guard():
-          np_x = np.random.uniform(0.1, 1, [10]).astype(np.float32)
-          x = fluid.dygraph.to_variable(np_x)
-          print(paddle.logsumexp(x).numpy())
+        paddle.disable_static()
 
-    ..  code-block:: python
-
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
-
-        with fluid.dygraph.guard():
-            np_x = np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32)
-            x = fluid.dygraph.to_variable(np_x)
-            print(paddle.logsumexp(x, dim=1).numpy())
-            print(paddle.logsumexp(x, dim=[0, 2]).numpy())
+        x = np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]])
+        x = paddle.to_tensor(x)
+        out1 = paddle.logsumexp(x) # [3.4691226]
+        out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602]
 
     """
-    op_type = 'logsumexp'
-    assert x is not None, 'x cannot be None in {}'.format(op_type)
+    if isinstance(axis, int):
+        axis = [axis]
+    reduce_all = True if axis is None \
+        or len(axis)==0 \
+        or len(axis) == len(x.shape) else False
+    if axis is None or len(axis) == 0:
+        axis = [0]
 
-    # reduce_sum does not support float16
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
+    if in_dygraph_mode():
+        return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim,
+                                    'reduce_all', reduce_all)
 
-    exp_out = layers.exp(x)
-    sum_out = layers.reduce_sum(exp_out, dim, keepdim)
+    check_variable_and_dtype(x, 'x',
+                             ['float32', 'float64'],
+                             'logsumexp')
 
-    return layers.log(sum_out, name)
+    helper = LayerHelper('logsumexp', **locals())
+    attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    return out
 
 
-def inverse(input, name=None):
+def inverse(x, name=None):
     """
-	:alias_main: paddle.inverse
-	:alias: paddle.inverse,paddle.tensor.inverse,paddle.tensor.math.inverse
-
     Takes the inverse of the square matrix. A square matrix is a matrix with
     the same number of rows and columns. The input can be a square matrix
     (2-D Tensor) or batches of square matrices.
 
     Args:
-        input (Variable): The input Variable which holds a Tensor. The last two
+        x (Variable): The input tensor. The last two
             dimensions should be equal. When the number of dimensions is
             greater than 2, it is treated as batches of square matrix. The data
             type can be float32 and float64.
@@ -1067,201 +1225,227 @@ def inverse(input, name=None):
             please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: A Tensor holds the inverse of input. The shape and data type
-            is the same as input.
+        Variable: A Tensor holds the inverse of x. The shape and data type
+                        is the same as x.
 
     Examples:
         .. code-block:: python
 
             import numpy as np
             import paddle
-            import paddle.fluid as fluid
 
             mat_np = np.array([[2, 0], [0, 2]]).astype("float32")
+            paddle.disable_static()
+            mat = paddle.to_variable(mat_np)
+            inv = paddle.inverse(mat)
+            print(inv) # [[0.5, 0], [0, 0.5]]
 
-            # example for static graph
-            input = fluid.data("input", shape=[2, 2], dtype="float32")
-            out = paddle.inverse(input)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            results = exe.run(feed={"input": mat_np },
-                              fetch_list=[out.name])
-            print(results[0]) # [[0.5, 0], [0, 0.5]]
-
-            # example for dynamic graph
-            with fluid.dygraph.guard():
-                mat = fluid.dygraph.to_variable(mat_np)
-                inv = paddle.inverse(mat)
-                print(inv) # [[0.5, 0], [0, 0.5]]
     """
     if in_dygraph_mode():
-        return core.ops.inverse(input)
+        return core.ops.inverse(x)
 
-    def _check_input(input):
-        check_variable_and_dtype(input, 'input',
+    def _check_input(x):
+        check_variable_and_dtype(x, 'x',
                                  ['float32', 'float64'], 'inverse')
-        if len(input.shape) < 2:
+        if len(x.shape) < 2:
             raise ValueError(
                 "The input of inverse is expected to be a Tensor whose number "
                 "of dimensions is no less than 2. But reviced: %d, "
-                "input's shape: %s." % (len(input.shape), input.shape))
-
-    _check_input(input)
-
+                "x's shape: %s." % (len(x.shape), x.shape))
+    _check_input(x)
     helper = LayerHelper('inverse', **locals())
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
-        type='inverse', inputs={'Input': [input] }, outputs={'Output': [out]})
+        type='inverse', inputs={'Input': [x] }, outputs={'Output': [out]})
     return out
 
 
-def max(input, dim=None, keep_dim=False, name=None):
+def max(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.max
-	:alias: paddle.max,paddle.tensor.max,paddle.tensor.math.max
 
-    Computes the maximum of tensor elements over the given dimension.
+    Computes the maximum of tensor elements over the given axis.
 
     Args:
-        input (Variable): The input variable which is a Tensor, the data type is float32,
+        x(Tensor): A tensor, the data type is float32,
             float64, int32, int64.
-        dim (list|int, optional): The dimension along which the maximum is computed.
+        axis(list|int, optional): The axis along which the maximum is computed.
             If :attr:`None`, compute the maximum over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool, optional): Whether to reserve the reduced dimension in the
+             `x` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`.
+            If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`.
+        keepdim(bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true, default
+            than the `x` unless :attr:`keepdim` is true, default
             value is False.
         name(str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: Tensor, results of maximum on the specified dim of input tensor,
-        it's data type is the same as input's Tensor.
+        Tensor, results of maximum on the specified axis of input tensor,
+        it's data type is the same as `x`.
 
     Examples:
         .. code-block:: python
+
+            import numpy as np
             import paddle
-            import paddle.fluid as fluid
 
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
-            paddle.max(x)  # [0.9]
-            paddle.max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
-            paddle.max(x, dim=-1)  # [0.9, 0.7]
-            paddle.max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the corresponding output tensor.
-            y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
-            paddle.max(y, dim=[1, 2]) # [4.0, 8.0]
-            paddle.max(y, dim=[0, 1]) # [7.0, 8.0]
+            paddle.disable_static()
+
+            # data_x is a variable with shape [2, 4]
+            # the axis is a int element
+            data_x = np.array([[0.2, 0.3, 0.5, 0.9],
+                               [0.1, 0.2, 0.6, 0.7]])
+            x = paddle.to_variable(data_x)
+            result1 = paddle.max(x)
+            print(result1.numpy())
+            #[0.9]
+            result2 = paddle.max(x, axis=0)
+            print(result2.numpy()) 
+            #[0.2 0.3 0.6 0.9]
+            result3 = paddle.max(x, axis=-1)
+            print(result3.numpy())
+            #[0.9 0.7]
+            result4 = paddle.max(x, axis=1, keepdim=True)
+            print(result4.numpy())
+            #[[0.9]
+            # [0.7]]
+
+            # data_y is a variable with shape [2, 2, 2]
+            # the axis is list 
+            data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
+                               [[5.0, 6.0], [7.0, 8.0]]])
+            y = paddle.to_variable(data_y)
+            result5 = paddle.max(y, axis=[1, 2])
+            print(result5.numpy())
+            #[4. 8.]
+            result6 = paddle.max(y, axis=[0, 1])
+            print(result6.numpy())
+            #[7. 8.]
     """
 
-    helper = LayerHelper('max', **locals())
-    out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
-
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'max')
-
-    reduce_all = True if dim == None or dim == [] else False
-    dim = dim if dim != None and dim != [] else [0]
+    if axis is not None and not isinstance(axis, list):
+        if isinstance(axis, tuple):
+            axis = list(axis)
+        elif isinstance(axis, int):
+            axis= [axis]
+        else:
+            raise TypeError(
+                "The type of axis must be int, list or tuple, but received {}".format(type(axis)))
 
+    reduce_all = True if axis == None or axis == [] else False
+    axis = axis if axis != None and axis != [] else [0]
     if in_dygraph_mode():
-        return core.ops.reduce_max(input, 'dim', dim, 'keep_dim', keep_dim,
+        return core.ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim,
                                    'reduce_all', reduce_all)
+
+    helper = LayerHelper('max', **locals())
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'max')
+
+    out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
     helper.append_op(
         type='reduce_max',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
         attrs={
-            'dim': dim,
-            'keep_dim': keep_dim,
+            'dim': axis,
+            'keep_dim': keepdim,
             'reduce_all': reduce_all
         })
     return out
 
-
-def min(input, dim=None, keep_dim=False, name=None):
+def min(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.min
-	:alias: paddle.min,paddle.tensor.min,paddle.tensor.math.min
 
-    Computes the minimum of tensor elements over the given dimension.
+    Computes the minimum of tensor elements over the given axis
 
     Args:
-        input (Variable): The input variable which is a Tensor, the data type is float32,
-            float64, int32, int64.
-        dim (list|int, optional): The dimensions along which the minimum is computed.
+        x(Tensor): A tensor, the data type is float32, float64, int32, int64.
+        axis(list|int, optional): The axis along which the minimum is computed.
             If :attr:`None`, compute the minimum over all elements of
-            :attr:`input` and return a Tensor variable with a single element,
-            otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
-        keep_dim (bool, optional): Whether to reserve the reduced dimension in the
+            `x` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-x.ndim, x.ndim)`.
+            If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`.
+        keepdim(bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
-            than the :attr:`input` unless :attr:`keep_dim` is true, default
+            than the `x` unless :attr:`keepdim` is true, default
             value is False.
         name(str, optional): The default value is None.  Normally there is no need for 
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
 
     Returns:
-        Variable: Tensor, result of minimum on the specified dim of input tensor,
+        Tensor, results of minimum on the specified axis of input tensor,
         it's data type is the same as input's Tensor.
 
     Examples:
         .. code-block:: python
+
+            import numpy as np
             import paddle
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
-            paddle.min(x)  # [0.1]
-            paddle.min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
-            paddle.min(x, dim=-1)  # [0.2, 0.1]
-            paddle.min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
-            # y is a Tensor variable with shape [2, 2, 2] and elements as below:
-            #      [[[1.0, 2.0], [3.0, 4.0]],
-            #      [[5.0, 6.0], [7.0, 8.0]]]
-            # Each example is followed by the corresponding output tensor.
-            y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
-            paddle.min(y, dim=[1, 2]) # [1.0, 5.0]
-            paddle.min(y, dim=[0, 1]) # [1.0, 2.0]
+
+            paddle.disable_static()
+
+            # data_x is a variable with shape [2, 4]
+            # the axis is a int element
+            data_x = np.array([[0.2, 0.3, 0.5, 0.9],
+                            [0.1, 0.2, 0.6, 0.7]])
+            x = paddle.to_variable(data_x)
+            result1 = paddle.min(x)
+            print(result1.numpy())
+            #[0.1]
+            result2 = paddle.min(x, axis=0)
+            print(result2.numpy())
+            #[0.1 0.2 0.5 0.7]
+            result3 = paddle.min(x, axis=-1)
+            print(result3.numpy()) 
+            #[0.2 0.1]
+            result4 = paddle.min(x, axis=1, keepdim=True)
+            print(result4.numpy())
+            #[[0.2]
+            # [0.1]]
+
+            # data_y is a variable with shape [2, 2, 2]
+            # the axis is list 
+            data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
+                               [[5.0, 6.0], [7.0, 8.0]]])
+            y = paddle.to_variable(data_y)
+            result5 = paddle.min(y, axis=[1, 2])
+            print(result5.numpy()) 
+            #[1. 5.]
+            result6 = paddle.min(y, axis=[0, 1])
+            print(result6.numpy())
+            #[1. 2.]
     """
 
-    helper = LayerHelper('min', **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
-    if dim is not None and not isinstance(dim, list):
-        dim = [dim]
+    if axis is not None and not isinstance(axis, list):
+        if isinstance(axis, tuple):
+            axis = list(axis)
+        elif isinstance(axis, int):
+            axis= [axis]
+        else:
+            raise TypeError(
+                "The type of axis must be int, list or tuple, but received {}".format(type(axis)))
+    reduce_all = True if axis == None or axis == [] else False
+    axis = axis if axis != None and axis != [] else [0]
+    if in_dygraph_mode():
+        return core.ops.reduce_min(x, 'dim', axis, 'keep_dim', keepdim,
+                                   'reduce_all', reduce_all)
 
+    helper = LayerHelper('min', **locals())
     check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'max')
-
-    reduce_all = True if dim == None or dim == [] else False
-    dim = dim if dim != None and dim != [] else [0]
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'min')
 
-    if in_dygraph_mode():
-        return core.ops.reduce_min(input, 'dim', dim, 'keep_dim', keep_dim,
-                                   'reduce_all', reduce_all)
+    out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
     helper.append_op(
         type='reduce_min',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
         attrs={
-            'dim': dim,
-            'keep_dim': keep_dim,
+            'dim': axis,
+            'keep_dim': keepdim,
             'reduce_all': reduce_all
         })
     return out
@@ -1352,14 +1536,14 @@ def addcmul(input, tensor1, tensor2, value=1.0, name=None):
     return out
 
 
-def clamp(input, min=None, max=None, name=None):
+def clip(x, min=None, max=None, name=None):
     """
-	:alias_main: paddle.clamp
-	:alias: paddle.clamp,paddle.tensor.clamp,paddle.tensor.math.clamp
+        :alias_main: paddle.clip
+        :alias: paddle.clip,paddle.tensor.clip,paddle.tensor.math.clip
 
-    **clampe layer**
+    **clip layer**
 
-    This operator clamps all elements in input into the range [ min, max ] and return
+    This operator clip all elements in input into the range [ min, max ] and return
     a resulting tensor as the following equation:
 
     .. math::
@@ -1367,38 +1551,35 @@ def clamp(input, min=None, max=None, name=None):
         Out = MIN(MAX(x, min), max)
 
     Args:
-        input (Variable): An input N-D Tensor or LoDTensor
-            with data type float32, float64.
-        min (float32|Variable): The lower bound with type ``float32`` or a ``Tensor``
+        x (Tensor): An N-D Tensor with data type float32 or float64.
+        min (float32|Tensor): The lower bound with type ``float32`` or a ``Tensor``
             with shape [1] and type ``int32``, ``float32``, ``float64``.
-        max (float32|Variable): The upper bound with type ``float32`` or a ``Tensor``
+        max (float32|Tensor): The upper bound with type ``float32`` or a ``Tensor``
             with shape [1] and type ``int32``, ``float32``, ``float64``.
         name (str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
             refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: A Tensor or LodTensor with the same data type and data shape as input's.
+        Tensor: A Tensor with the same data type and data shape as input.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
             import numpy as np
 
-            in1 = np.array([[1.2,3.5],
-                            [4.5,6.4]]).astype('float32')
-            with fluid.dygraph.guard():
-                x1 = fluid.dygraph.to_variable(in1)
-                out1 = paddle.tensor.clamp(x1, min=3.5, max=5.0)
-                out2 = paddle.tensor.clamp(x1, min=2.5)
-                print(out1.numpy())
-                # [[3.5, 3.5]
-                # [4.5, 5.0]]
-                print(out2.numpy())
-                # [[2.5, 3.5]
-                # [[4.5, 6.4]
+            paddle.disable_static()
+            x = np.array([[1.2,3.5], [4.5,6.4]]).astype('float32')
+            x1 = paddle.to_variable(x)
+            out1 = paddle.clip(x1, min=3.5, max=5.0)
+            out2 = paddle.clip(x1, min=2.5)
+            print(out1.numpy())
+            # [[3.5, 3.5]
+            # [4.5, 5.0]]
+            print(out2.numpy())
+            # [[2.5, 3.5]
+            # [[4.5, 6.4]
     """
 
     assert min is not None or max is not None, "either min or max should be defined."
@@ -1406,20 +1587,22 @@ def clamp(input, min=None, max=None, name=None):
     if in_dygraph_mode():
         min = sys.float_info.min if min is None else min
         max = sys.float_info.max if max is None else max
-        return core.ops.clip(input, "min", min, "max", max)
+        return core.ops.clip(x, "min", min, "max", max)
 
     if min is not None:
-        check_type(min, 'min', (float, Variable), 'clamp')
+        check_type(min, 'min', (float, int, Variable), 'clip')
         if isinstance(min, Variable):
             check_dtype(min.dtype, 'min', ['float32', 'float64', 'int32'],
-                        'clamp', '(When the type of min in clamp is Variable.)')
+                        'clip', '(When the type of min in clip is Variable.)')
     if max is not None:
-        check_type(max, 'max', (float, Variable), 'clamp')
+        check_type(max, 'max', (float, int, Variable), 'clip')
         if isinstance(max, Variable):
             check_dtype(max.dtype, 'max', ['float32', 'float64', 'int32'],
-                        'clamp', '(When the type of max in clamp is Variable.)')
+                        'clip', '(When the type of max in clip is Variable.)')
 
-    inputs = {'X': input}
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip')
+
+    inputs = {'X': x}
     attrs = {'min': sys.float_info.min, 'max': sys.float_info.max}
 
     if isinstance(min, Variable):
@@ -1434,9 +1617,9 @@ def clamp(input, min=None, max=None, name=None):
     elif max is not None:
         attrs['max'] = max
 
-    helper = LayerHelper('clamp', **locals())
+    helper = LayerHelper('clip', **locals())
     output = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+        dtype=helper.input_dtype())
     helper.append_op(
         type='clip', inputs=inputs, outputs={'Out': [output]}, attrs=attrs)
 
@@ -1482,11 +1665,11 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None):
             case2 = np.random.randn(3, 10, 10).astype('float32')
             case3 = np.random.randn(3, 10, 5, 10).astype('float32')
 
-            paddle.enable_imperative()
+            paddle.disable_static()
 
-            case1 = paddle.imperative.to_variable(case1)
-            case2 = paddle.imperative.to_variable(case2)
-            case3 = paddle.imperative.to_variable(case3)
+            case1 = paddle.to_variable(case1)
+            case2 = paddle.to_variable(case2)
+            case3 = paddle.to_variable(case3)
             data1 = paddle.trace(case1) # data1.shape = [1]
             data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3]
             data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5]
@@ -1593,3 +1776,322 @@ ${comment}
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="kron", inputs={"X": x, "Y": y}, outputs={"Out": out})
     return out
+
+
+def cumsum(x, axis=None, dtype=None, name=None):
+    """
+    The cumulative sum of the elements along a given axis. The first element of the result is the same of the first element of the input. 
+
+    Args:
+        x (Tensor): Input of cumsum operator, the Tensor needed to be cumsumed. 
+        axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
+        dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. 
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, the result of cumsum operator, output of cumsum operator. 
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle
+            from paddle import to_variable
+            import numpy as np
+
+            paddle.disable_static()
+            data_np = np.arange(12).reshape(3, 4)
+            data = to_variable(data_np)
+
+            y = paddle.cumsum(data)
+            print(y.numpy())
+            # [ 0  1  3  6 10 15 21 28 36 45 55 66]
+
+            y = paddle.cumsum(data, axis=0)
+            print(y.numpy())
+            # [[ 0  1  2  3]
+            #  [ 4  6  8 10]
+            #  [12 15 18 21]]
+            
+            y = paddle.cumsum(data, axis=-1)
+            print(y.numpy())
+            # [[ 0  1  3  6]
+            #  [ 4  9 15 22]
+            #  [ 8 17 27 38]]
+
+            y = paddle.cumsum(data, dtype='float64')
+            print(y.dtype)
+            # VarType.FP64
+    """
+    if axis is None:
+        flatten = True
+    else:
+        flatten = False
+    if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype):
+        x = layers.cast(x, dtype)
+
+    if in_dygraph_mode():
+        if axis is None:
+            return core.ops.cumsum(x, 'flatten', flatten)
+        else:
+            return core.ops.cumsum(x, 'axis', axis, 'flatten', flatten)
+
+    check_type(x, 'x', (Variable), 'cumsum')
+    locals_var = locals().copy()
+    kwargs = dict()
+    for name, val in locals_var.items():
+        if val is not None:
+            kwargs[name] = val
+    _cum_sum_ = generate_layer_fn('cumsum')
+    return _cum_sum_(**kwargs)
+
+def isfinite(x, name=None):
+    """
+
+    Return whether every element of input tensor is finite number or not.
+
+    Args:
+        x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        `Tensor`, the bool result which shows every element of `x` whether it is finite number or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
+            x = paddle.to_tensor(x_np)
+            out = paddle.tensor.isfinite(x)
+            print(out.numpy())  # [False  True  True False  True False False]
+    """
+    if in_dygraph_mode():
+        return core.ops.isfinite_v2(x)
+    helper = LayerHelper("isfinite_v2", **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isfinite')
+    out = helper.create_variable_for_type_inference('bool')
+    helper.append_op(type="isfinite_v2", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+def isinf(x, name=None):
+    """
+
+    Return whether every element of input tensor is `+/-INF` or not.
+
+    Args:
+        x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        `Tensor`, the bool result which shows every element of `x` whether it is `+/-INF` or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
+            x = paddle.to_tensor(x_np)
+            out = paddle.tensor.isinf(x)
+            print(out.numpy())  # [ True False False  True False False False]
+    """
+    if in_dygraph_mode():
+        return core.ops.isinf_v2(x)
+    helper = LayerHelper("isinf_v2", **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf')
+    out = helper.create_variable_for_type_inference(dtype='bool')
+    helper.append_op(type="isinf_v2", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+def isnan(x, name=None):
+    """
+
+    Return whether every element of input tensor is `NaN` or not.
+
+    Args:
+        x (Tensor): The input tensor, it's data type should be float16, float32, float64, int32, int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        `Tensor`, the bool result which shows every element of `x` whether it is `NaN` or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.disable_static()
+            x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')])
+            x = paddle.to_tensor(x_np)
+            out = paddle.tensor.isnan(x)
+            print(out.numpy())  # [False False False False False  True  True]
+    """
+    if in_dygraph_mode():
+        return core.ops.isnan_v2(x)
+    helper = LayerHelper("isnan_v2", **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan')
+    out = helper.create_variable_for_type_inference(dtype='bool')
+    helper.append_op(type="isnan_v2", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+
+def prod(x, axis=None, keepdim=False, dtype=None, name=None):
+    """
+    Compute the product of tensor elements over the given axis.
+
+    Args:
+        x(Tensor): The input tensor, its data type should be float32, float64, int32, int64.
+        axis(int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, 
+            multiply all elements of `x` and return a Tensor with a single element, 
+            otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, 
+            the axis to reduce is :math:`x.ndim + axis[i]`. Default is None.
+        dtype(str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, 
+            int32, int64. If specified, the input tensor is casted to dtype before operator performed. 
+            This is very useful for avoiding data type overflows. The default value is None, the dtype 
+            of output is the same as input Tensor `x`.
+        keepdim(bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result 
+            tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False.
+        name(string, optional): The default value is None. Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor, result of product on the specified dim of input tensor.
+
+    Raises:
+        ValueError: The :attr:`dtype` must be float32, float64, int32 or int64.
+        TypeError: The type of :attr:`axis` must be int, list or tuple.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            # the axis is a int element
+            data_x = np.array([[0.2, 0.3, 0.5, 0.9],
+                         [0.1, 0.2, 0.6, 0.7]]).astype(np.float32)
+            x = paddle.to_tensor(data_x)
+            out1 = paddle.prod(x)
+            print(out1.numpy())
+            # [0.0002268]
+
+            out2 = paddle.prod(x, -1)
+            print(out2.numpy())
+            # [0.027  0.0084]
+
+            out3 = paddle.prod(x, 0)
+            print(out3.numpy())
+            # [0.02 0.06 0.3  0.63]
+            print(out3.numpy().dtype)
+            # float32
+
+            out4 = paddle.prod(x, 0, keepdim=True)
+            print(out4.numpy())
+            # [[0.02 0.06 0.3  0.63]]
+
+            out5 = paddle.prod(x, 0, dtype='int64')
+            print(out5.numpy())
+            # [0 0 0 0]
+            print(out5.numpy().dtype)
+            # int64
+
+            # the axis is list
+            data_y = np.array([[[1.0, 2.0], [3.0, 4.0]],
+                               [[5.0, 6.0], [7.0, 8.0]]])
+            y = paddle.to_tensor(data_y)
+            out6 = paddle.prod(y, [0, 1])
+            print(out6.numpy())
+            # [105. 384.]
+
+            out7 = paddle.prod(y, (1, 2))
+            print(out7.numpy())
+            # [  24. 1680.]
+
+    """
+    if dtype is not None:
+        check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'prod')
+        if x.dtype != convert_np_dtype_to_dtype_(dtype):
+            x = layers.cast(x, dtype)
+
+    return layers.reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name)
+
+
+def sign(x, name=None):
+    """
+    This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero.
+
+    Args:
+        x(Tensor): The input tensor. The data type can be float16, float32 or float64.
+        name (str, optional): The default value is None. Normally there is no need for user to
+            set this property. For more information, please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Tensor: The output sign tensor with identical shape and data type to the input :attr:`x`.
+
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+
+          data = np.array([3.0, 0.0, -2.0, 1.7], dtype='float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(data)
+          out = paddle.sign(x=x)
+          print(out)  # [1.0, 0.0, -1.0, 1.0]
+    """
+    if in_dygraph_mode():
+        return core.ops.sign(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sign')
+    helper = LayerHelper("sign", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    helper.append_op(type='sign', inputs={'X': [x]}, outputs={'Out': [out]})
+
+    return out
+
+
+def tanh(x, name=None):
+    """
+    Tanh Activation Operator.
+
+    .. math::
+        out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
+
+    Args:
+        x (Tensor): Input of Tanh operator, an N-D Tensor, with data type float32, float64 or float16.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Output of Tanh operator, a Tensor with same data type and shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+            x = paddle.to_tensor(x_data)
+            out = paddle.tanh(x)
+            print(out.numpy())
+            # [-0.37994896 -0.19737532  0.09966799  0.29131261]
+    """
+    if in_dygraph_mode():
+        return core.ops.tanh(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tanh')
+    check_type(x, 'x', (Variable), 'tanh')
+    helper = LayerHelper('tanh', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='tanh', inputs={'X': x}, outputs={'Out': out})
+    return out
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 5e9f55cd34c3e3e5cedee10352c7a5d96fbb8abc..c652d0f1891c8bd0a4c85ea777527a2fd82ad11b 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -21,22 +21,411 @@ from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, V
 from ..fluid.layers.layer_function_generator import templatedoc
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
-from ..fluid.layers import utils, uniform_random, gaussian_random
+from ..fluid.layers import utils
 from ..fluid.layers.tensor import fill_constant
+import paddle
+import warnings
 
 from ..fluid.io import shuffle  #DEFINE_ALIAS
 
 __all__ = [
-    #       'gaussin',
-    #       'uniform',
+    'bernoulli',
+    'standard_normal',
+    'normal',
+    'uniform',
     'shuffle',
     'randn',
     'rand',
     'randint',
-    'randperm'
+    'randperm',
 ]
 
 
+def bernoulli(x, name=None):
+    """
+
+    This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution.
+    The input ``x`` is a tensor with probabilities for generating the random binary number.
+    Each element in ``x`` should be in [0, 1], and the out is generated by:
+    
+    .. math::
+
+        out_i ~ Bernoulli (x_i)
+
+    Args:
+        x(Tensor):  A tensor with probabilities for generating the random binary number. The data type 
+            should be float32, float64.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+    Returns: 
+        Tensor: A Tensor filled with random binary number with the same shape and dtype as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+        import paddle
+        import numpy as np
+
+        paddle.disable_static()
+
+        x = paddle.rand([2, 3])
+        print(x.numpy())
+        # [[0.11272584 0.3890902  0.7730957 ]
+        # [0.10351662 0.8510418  0.63806665]]
+
+        out = paddle.bernoulli(x)
+        print(out.numpy())
+        # [[0. 0. 1.]
+        # [0. 0. 1.]]
+
+    """
+
+    if in_dygraph_mode():
+        return core.ops.bernoulli(x)
+
+    check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli")
+
+    helper = LayerHelper("randint", **locals())
+    out = helper.create_variable_for_type_inference(
+        dtype=x.dtype)  # maybe set out to int32 ? 
+    helper.append_op(
+        type='bernoulli', inputs={"X": x}, outputs={'Out': out}, attrs={})
+    return out
+
+
+def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a Gaussian
+    distribution, with ``shape`` and ``dtype``.
+
+    Args:
+        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64).
+        mean(float|int, optional): Mean of the output tensor, default is 0.0.
+        std(float|int, optional): Standard deviation of the output tensor, default
+            is 1.0.
+        seed(int, optional): ${seed_comment}
+        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of
+            the output Tensor. Supported data types: float32, float64.
+            Default is float32.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A Tensor filled with random values sampled from a Gaussian
+        distribution, with ``shape`` and ``dtype``. 
+    """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+    seed = 0
+    op_type_for_check = 'gaussian_random/standard_normal/randn/normal'
+
+    if in_dygraph_mode():
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.gaussian_random('shape', shape, 'mean',
+                                        float(mean), 'std',
+                                        float(std), 'seed', seed, 'dtype',
+                                        dtype)
+
+    check_type(shape, 'shape', (list, tuple, Variable), op_type_for_check)
+    check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check)
+
+    inputs = {}
+    attrs = {
+        'mean': mean,
+        'std': std,
+        'seed': seed,
+        'dtype': dtype,
+        'use_mkldnn': False
+    }
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check)
+
+    helper = LayerHelper('gaussian_random', **locals())
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='gaussian_random',
+        inputs=inputs,
+        outputs={'Out': out},
+        attrs=attrs)
+    out.stop_gradient = True
+    return out
+
+
+def standard_normal(shape, dtype=None, name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a standard
+    normal distribution with mean 0 and standard deviation 1, with ``shape``
+    and ``dtype``.
+
+    Args:
+        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64).
+        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
+            output tensor. Supported data types: float32, float64. If ``dytpe``
+            is None, the data type is float32. Default is None.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A Tensor filled with random values sampled from a standard
+        normal distribution with mean 0 and standard deviation 1, with
+        ``shape`` and ``dtype``.
+
+    Raises:
+        TypeError: If ``shape`` is not list, tuple, Tensor.
+        TypeError: If ``dtype`` is not float32, float64.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            # example 1: attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.standard_normal(shape=[2, 3])
+            # [[-2.923464  ,  0.11934398, -0.51249987],  # random
+            #  [ 0.39632758,  0.08177969,  0.2692008 ]]  # random
+
+            # example 2: attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.standard_normal(shape=[dim_1, dim_2, 2])
+            # [[[-2.8852394 , -0.25898588],  # random
+            #   [-0.47420555,  0.17683524],  # random
+            #   [-0.7989969 ,  0.00754541]],  # random
+            #  [[ 0.85201347,  0.32320443],  # random
+            #   [ 1.1399018 ,  0.48336947],  # random
+            #   [ 0.8086993 ,  0.6868893 ]]]  # random
+
+            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            var_shape = paddle.to_tensor(np.array([2, 3]))
+            result_3 = paddle.standard_normal(var_shape)
+            # [[-2.878077 ,  0.17099959,  0.05111201]  # random
+            #  [-0.3761474, -1.044801  ,  1.1870178 ]]  # random
+
+    """
+    if dtype is None:
+        dtype = 'float32'
+
+    return gaussian_random(
+        shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name)
+
+
+randn = standard_normal
+
+
+def normal(mean=0.0, std=1.0, shape=None, name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a normal
+    distribution with ``mean`` and ``std`` (standard deviation) .
+
+    If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``.
+    If ``mean`` is not a Tensor and ``std`` is a Tensor, the output Tensor has the same shape and data type as ``std``.
+    If ``mean`` and ``std`` are not a Tensor, the output Tensor has the same shape as ``shape``, with data type float32.
+
+    If ``mean`` and ``std`` are Tensor, the num of elements of ``mean`` and ``std`` should be the same.
+
+    Args:
+        mean (float|Tensor, optional): The mean of the output Tensor's normal distribution.
+            If ``mean`` is float, all elements of the output Tensor shared the same mean.
+            If ``mean`` is a Tensor(data type supports float32, float64), it has per-element means.
+            Default is 0.0
+        std (float|Tensor, optional): The  standard deviation of the output Tensor's normal distribution.
+            If ``std`` is float, all elements of the output Tensor shared the same standard deviation.
+            If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations.
+            Defaule is 1.0
+        shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64). If ``mean`` or ``std`` is a Tensor, the shape of the output
+            Tensor is the same as ``mean`` or ``std`` , attr ``shape`` is ignored.
+            Default is None
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            out1 = paddle.normal(shape=[2, 3])
+            # [[ 0.17501129  0.32364586  1.561118  ]  # random
+            #  [-1.7232178   1.1545963  -0.76156676]]  # random
+
+            mean_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0]))
+            out2 = paddle.normal(mean=mean_tensor)
+            # [ 0.18644847 -1.19434458  3.93694787]  # random
+
+            std_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0]))
+            out3 = paddle.normal(mean=mean_tensor, std=std_tensor)
+            # [1.00780561 3.78457445 5.81058198]  # random
+
+    """
+    if not in_dygraph_mode():
+        check_type(mean, 'mean', (int, float, Variable), 'normal')
+        check_type(std, 'std', (int, float, Variable), 'normal')
+        if isinstance(mean, Variable):
+            check_dtype(
+                mean.dtype, 'mean', ['float32', 'float64'], 'normal',
+                "If mean is Tensor, it's data type only support float32, float64."
+            )
+        if isinstance(std, Variable):
+            check_dtype(
+                std.dtype, 'std', ['float32', 'float64'], 'normal',
+                "If std is Tensor, it's data type only support float32, float64."
+            )
+        if shape is not None:
+            if isinstance(shape, (list, tuple)):
+                for item in shape:
+                    check_type(item, 'shape', (int), 'normal',
+                               'Elements of shape should be int.')
+            elif isinstance(shape, Variable):
+                check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'normal')
+            else:
+                assert TypeError(
+                    'If mean and std are all not Tensor, shape should be list, tuple, Tensor.'
+                )
+
+    if isinstance(mean, Variable):
+        if isinstance(std, Variable):
+            if std.dtype != mean.dtype:
+                std = paddle.cast(std, mean.dtype)
+            mean_shape = paddle.shape(mean)
+            std = paddle.reshape(std, mean_shape)
+        else:
+            std = float(std)
+        out = standard_normal(paddle.shape(mean), mean.dtype, name)
+    elif isinstance(std, Variable):
+        mean = float(mean)
+        out = standard_normal(paddle.shape(std), std.dtype, name)
+    else:
+        return gaussian_random(shape=shape, mean=mean, std=std, name=name)
+
+    out = out * std + mean
+    if not in_dygraph_mode():
+        out.stop_grediant = True
+    return out
+
+
+def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None):
+    """
+    This OP returns a Tensor filled with random values sampled from a uniform
+    distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``.
+
+    Examples:
+    ::
+        Input:
+          shape = [1, 2]
+        Output:
+          result=[[0.8505902, 0.8397286]]
+
+    Args:
+        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
+            is a list or tuple, the elements of it should be integers or Tensors
+            (with the shape [1], and the data type int32 or int64). If ``shape``
+            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
+            int64).
+        dtype(str|np.dtype, optional): The data type of
+            the output Tensor. Supported data types: float32, float64.
+            Default is float32.
+        min(float|int, optional): The lower bound on the range of random values
+            to generate, ``min`` is included in the range. Default is -1.0.
+        max(float|int, optional): The upper bound on the range of random values
+            to generate, ``max`` is excluded in the range. Default is 1.0.
+        seed(int, optional): Random seed used for generating samples. 0 means
+            use a seed generated by the system. Note that if seed is not 0,
+            this operator will always generate the same random numbers every
+            time. Default is 0.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A Tensor filled with random values sampled from a uniform
+        distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``.
+
+    Raises:
+        TypeError: If ``shape`` is not list, tuple, Tensor.
+        TypeError: If ``dtype`` is not float32, float64.
+
+    Examples:
+        .. code-block:: python
+            
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            # example 1:
+            # attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.tensor.random.uniform(shape=[3, 4])
+            # [[ 0.84524226,  0.6921872,   0.56528175,  0.71690357],
+            #  [-0.34646994, -0.45116323, -0.09902662, -0.11397249],
+            #  [ 0.433519,    0.39483607, -0.8660099,   0.83664286]]
+
+            # example 2:
+            # attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.tensor.random.uniform(shape=[dim_1, dim_2])
+            # [[-0.9951253,   0.30757582, 0.9899647 ],
+            #  [ 0.5864527,   0.6607096,  -0.8886161 ]]
+
+            # example 3:
+            # attr shape is a Tensor, the data type must be int64 or int32.
+            shape = np.array([2, 3])
+            shape_tensor = paddle.to_tensor(shape)
+            result_3 = paddle.tensor.random.uniform(shape_tensor)
+            # if shape_tensor's value is [2, 3]
+            # result_3 is:
+            # [[-0.8517412,  -0.4006908,   0.2551912 ],
+            #  [ 0.3364414,   0.36278176, -0.16085452]]
+
+
+    """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if in_dygraph_mode():
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.uniform_random('shape', shape, 'min',
+                                       float(min), 'max',
+                                       float(max), 'seed', seed, 'dtype', dtype)
+
+    check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand')
+    check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand')
+
+    inputs = dict()
+    attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand')
+
+    helper = LayerHelper("uniform_random", **locals())
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="uniform_random", inputs=inputs, attrs=attrs,
+        outputs={"Out": out})
+    return out
+
+
 def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     """
 	:alias_main: paddle.randint
@@ -78,40 +467,40 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-
-        # example 1:
-        # attr shape is a list which doesn't contain Tensor.
-        result_1 = paddle.randint(low=-5, high=5, shape=[3])
-        # [0, -3, 2]
-
-        # example 2:
-        # attr shape is a list which contains Tensor.
-        dim_1 = paddle.fill_constant([1], "int64", 2)
-        dim_2 = paddle.fill_constant([1], "int32", 3)
-        result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
-        # [[0, -1, -3],
-        #  [4, -2,  0]]
-
-        # example 3:
-        # attr shape is a Tensor
-        var_shape = paddle.imperative.to_variable(np.array([3]))
-        result_3 = paddle.randint(low=-5, high=5, shape=var_shape)
-        # [-2, 2, 3]
-
-        # example 4:
-        # data type is int32
-        result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
-        # [-5, 4, -4]
-
-        # example 5:
-        # Input only one parameter
-        # low=0, high=10, shape=[1], dtype='int64'
-        result_5 = paddle.randint(10)
-        # [7]
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+
+            # example 1:
+            # attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.randint(low=-5, high=5, shape=[3])
+            # [0, -3, 2]  # random
+
+            # example 2:
+            # attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
+            # [[0, -1, -3],  # random
+            #  [4, -2,  0]]  # random
+
+            # example 3:
+            # attr shape is a Tensor
+            var_shape = paddle.to_variable(np.array([3]))
+            result_3 = paddle.randint(low=-5, high=5, shape=var_shape)
+            # [-2, 2, 3]  # random
+
+            # example 4:
+            # data type is int32
+            result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
+            # [-5, 4, -4]  # random
+
+            # example 5:
+            # Input only one parameter
+            # low=0, high=10, shape=[1], dtype='int64'
+            result_5 = paddle.randint(10)
+            # [7]  # random
 
     """
     if high is None:
@@ -150,77 +539,6 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     return out
 
 
-def randn(shape, dtype=None, name=None):
-    """
-	:alias_main: paddle.randn
-	:alias: paddle.tensor.randn, paddle.tensor.random.randn
-
-    This OP returns a Tensor filled with random values sampled from a normal
-    distribution with mean 0 and standard deviation 1 (also called the standard
-    normal distribution), with ``shape`` and ``dtype``.
-
-    Args:
-        shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape``
-            is a list or tuple, the elements of it should be integers or Tensors
-            (with the shape [1], and the data type int32 or int64). If ``shape``
-            is a Tensor, it should be a 1-D Tensor(with the data type int32 or
-            int64).
-        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the
-            output tensor. Supported data types: float32, float64. If ``dytpe``
-            is None, the data type is float32. Default is None.
-        name(str, optional): The default value is None. Normally there is no
-            need for user to set this property. For more information, please
-            refer to :ref:`api_guide_Name`.
-
-    Returns:
-        Tensor: A Tensor filled with random values sampled from a normal
-        distribution with mean 0 and standard deviation 1 (also called the
-        standard normal distribution), with ``shape`` and ``dtype``.
-
-    Raises:
-        TypeError: If ``shape`` is not list, tuple, Tensor.
-        TypeError: If ``dtype`` is not float32, float64.
-
-    Examples:
-        .. code-block:: python
-
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-
-        # example 1: attr shape is a list which doesn't contain Tensor.
-        result_1 = paddle.randn(shape=[2, 3])
-        # [[-2.923464  ,  0.11934398, -0.51249987],
-        #  [ 0.39632758,  0.08177969,  0.2692008 ]]
-
-        # example 2: attr shape is a list which contains Tensor.
-        dim_1 = paddle.fill_constant([1], "int64", 2)
-        dim_2 = paddle.fill_constant([1], "int32", 3)
-        result_2 = paddle.randn(shape=[dim_1, dim_2, 2])
-        # [[[-2.8852394 , -0.25898588],
-        #   [-0.47420555,  0.17683524],
-        #   [-0.7989969 ,  0.00754541]],
-        #  [[ 0.85201347,  0.32320443],
-        #   [ 1.1399018 ,  0.48336947],
-        #   [ 0.8086993 ,  0.6868893 ]]]
-
-        # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-        var_shape = paddle.imperative.to_variable(np.array([2, 3]))
-        result_3 = paddle.randn(var_shape)
-        # [[-2.878077 ,  0.17099959,  0.05111201]
-        #  [-0.3761474, -1.044801  ,  1.1870178 ]]
-
-    """
-    if dtype is None:
-        dtype = 'float32'
-
-    out = gaussian_random(
-        shape=shape, mean=0.0, std=1.0, seed=0, dtype=dtype, name=name)
-    out.stop_gradient = True
-    return out
-
-
 @templatedoc()
 def randperm(n, dtype="int64", name=None):
     """
@@ -250,15 +568,15 @@ def randperm(n, dtype="int64", name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
+            import paddle
 
-        paddle.enable_imperative()
+            paddle.disable_static()
 
-        result_1 = paddle.randperm(5)
-        # [4, 1, 2, 3, 0]
+            result_1 = paddle.randperm(5)
+            # [4, 1, 2, 3, 0]  # random
 
-        result_2 = paddle.randperm(7, 'int32')
-        # [1, 6, 2, 0, 4, 3, 5]
+            result_2 = paddle.randperm(7, 'int32')
+            # [1, 6, 2, 0, 4, 3, 5]  # random
  
     """
     if not isinstance(dtype, core.VarDesc.VarType):
@@ -322,36 +640,36 @@ def rand(shape, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-        # example 1: attr shape is a list which doesn't contain Tensor.
-        result_1 = paddle.rand(shape=[2, 3])
-        # [[0.451152  , 0.55825245, 0.403311  ],
-        #  [0.22550228, 0.22106001, 0.7877319 ]]
-
-        # example 2: attr shape is a list which contains Tensor.
-        dim_1 = paddle.fill_constant([1], "int64", 2)
-        dim_2 = paddle.fill_constant([1], "int32", 3)
-        result_2 = paddle.rand(shape=[dim_1, dim_2, 2])
-        # [[[0.8879919 , 0.25788337],
-        #   [0.28826773, 0.9712097 ],
-        #   [0.26438272, 0.01796806]],
-        #  [[0.33633623, 0.28654453],
-        #   [0.79109055, 0.7305809 ],
-        #   [0.870881  , 0.2984597 ]]]
-
-        # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-        var_shape = paddle.imperative.to_variable(np.array([2, 3]))
-        result_3 = paddle.rand(var_shape)
-        # [[0.22920267, 0.841956  , 0.05981819],
-        #  [0.4836288 , 0.24573246, 0.7516129 ]]
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            # example 1: attr shape is a list which doesn't contain Tensor.
+            result_1 = paddle.rand(shape=[2, 3])
+            # [[0.451152  , 0.55825245, 0.403311  ],  # random
+            #  [0.22550228, 0.22106001, 0.7877319 ]]  # random
+
+            # example 2: attr shape is a list which contains Tensor.
+            dim_1 = paddle.fill_constant([1], "int64", 2)
+            dim_2 = paddle.fill_constant([1], "int32", 3)
+            result_2 = paddle.rand(shape=[dim_1, dim_2, 2])
+            # [[[0.8879919 , 0.25788337],  # random
+            #   [0.28826773, 0.9712097 ],  # random
+            #   [0.26438272, 0.01796806]],  # random
+            #  [[0.33633623, 0.28654453],  # random
+            #   [0.79109055, 0.7305809 ],  # random
+            #   [0.870881  , 0.2984597 ]]]  # random
+
+            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            var_shape = paddle.to_variable(np.array([2, 3]))
+            result_3 = paddle.rand(var_shape)
+            # [[0.22920267, 0.841956  , 0.05981819],  # random
+            #  [0.4836288 , 0.24573246, 0.7516129 ]]  # random
 
     """
     if dtype is None:
         dtype = 'float32'
 
-    out = uniform_random(shape, dtype, min=0.0, max=1.0, name=name)
+    out = uniform(shape, dtype, min=0.0, max=1.0, name=name)
     out.stop_gradient = True
     return out
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 1cb775c9d4b73beaf0f2167fe7fc9909e91d116d..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -21,7 +21,6 @@ from ..fluid import core, layers
 from ..fluid.layers import argmin  #DEFINE_ALIAS
 from ..fluid.layers import has_inf  #DEFINE_ALIAS
 from ..fluid.layers import has_nan  #DEFINE_ALIAS
-from ..fluid.layers import topk  #DEFINE_ALIAS
 
 __all__ = [
     'argmax',
@@ -29,13 +28,13 @@ __all__ = [
     'argsort',
     'has_inf',
     'has_nan',
-    #       'masked_select',
+    'masked_select',
     'topk',
     'where',
     'index_select',
     'nonzero',
     'sort',
-    'index_sample'
+    'index_sample',
 ]
 
 from paddle.common_ops_import import *
@@ -68,17 +67,16 @@ def argsort(x, axis=-1, descending=False, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            import paddle.imperative as imperative 
             import numpy as np
             
-            paddle.enable_imperative()
+            paddle.disable_static()
             input_array = np.array([[[5,8,9,5],
                             [0,0,1,7],
                             [6,9,2,4]],
                             [[5,2,4,2],
                             [4,7,7,9],
                             [1,7,0,6]]]).astype(np.float32)
-            x = imperative.to_variable(input_array)
+            x = paddle.to_variable(input_array)
             out1 = paddle.argsort(x=x, axis=-1)
             out2 = paddle.argsort(x=x, axis=0)
             out3 = paddle.argsort(x=x, axis=1)
@@ -126,95 +124,168 @@ def argsort(x, axis=-1, descending=False, name=None):
     return ids
 
 
-def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None):
+def argmax(x, axis=None, dtype=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.argmax
-	:alias: paddle.argmax,paddle.tensor.argmax,paddle.tensor.search.argmax
-
     This OP computes the indices of the max elements of the input tensor's
     element along the provided axis.
 
     Args:
-        input(Variable): An input N-D Tensor with type float32, float64, int16,
+        x(Tensor): An input N-D Tensor with type float32, float64, int16,
             int32, int64, uint8.
         axis(int, optional): Axis to compute indices along. The effective range
-            is [-R, R), where R is Rank(input). when axis<0, it works the same way
-            as axis+R. Default is None, it will use the last dim to select indices of max value.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor which can
+            is [-R, R), where R is x.ndim. when axis < 0, it works the same way
+            as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index.
+        dtype(str): Data type of the output tensor which can
                     be int32, int64. The default value is None, and it will
                     return the int64 indices.
-        out(Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result. Defalut is None.
-        keepdims(bool, optional): Keep the axis that do the select max.
+        keepdim(bool, optional): Keep the axis that selecting max. The defalut value is False.
         name(str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
             refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: A Tensor with data type int64.
+        Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`
 
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.fluid as fluid
             import numpy as np
+            import paddle
 
-            in1 = np.array([[[5,8,9,5],
-                            [0,0,1,7],
-                            [6,9,2,4]],
-                            [[5,2,4,2],
-                            [4,7,7,9],
-                            [1,7,0,6]]])
-            with fluid.dygraph.guard():
-                x = fluid.dygraph.to_variable(in1)
-                out1 = paddle.argmax(input=x, axis=-1)
-                out2 = paddle.argmax(input=x, axis=0)
-                out3 = paddle.argmax(input=x, axis=1)
-                out4 = paddle.argmax(input=x, axis=2)
-                out5 = paddle.argmax(input=x, axis=2, keepdims=True)
-                print(out1.numpy())
-                # [[2 3 1]
-                #  [0 3 1]]
-                print(out2.numpy())
-                # [[0 0 0 0]
-                #  [1 1 1 1]
-                #  [0 0 0 1]]
-                print(out3.numpy())
-                # [[2 2 0 1]
-                #  [0 1 1 1]]
-                print(out4.numpy())
-                # [[2 3 1]
-                #  [0 3 1]]
-                print(out5.numpy())
-                #array([[[2],
-                #        [3],
-                #        [1]],
-                #       [[0],
-                #        [3],
-                #        [1]]])
+            paddle.disable_static()
+            data = np.array([[5,8,9,5],
+                             [0,0,1,7],
+                             [6,9,2,4]])
+            x =  paddle.to_variable(data)
+            out1 = paddle.argmax(x)
+            print(out1.numpy()) # 2
+            out2 = paddle.argmax(x, axis=1)
+            print(out2.numpy()) 
+            # [2 3 1]
+            out3 = paddle.argmax(x, axis=-1)
+            print(out3.numpy()) 
+            # [2 3 1]
     """
-    helper = LayerHelper("arg_max", **locals())
+    flatten = False
+    if axis is None:
+        flatten = True
+        axis = 0
+
+    if in_dygraph_mode():
+        if dtype != None:
+            var_dtype = convert_np_dtype_to_dtype_(dtype)
+            out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype,
+                                   'keepdim', keepdim, 'flatten', flatten)
+        else:
+            out = core.ops.arg_max(x, 'axis', axis, 'keepdim', keepdim,
+                                   'flatten', flatten)
+        return out
+
+    helper = LayerHelper("argmax", **locals())
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
+        'paddle.argmax')
     var_dtype = None
     attrs = {}
     if dtype is not None:
-        check_dtype(dtype, 'create data type', ['int32', 'int64'], 'arg_max')
+        if dtype not in ['int32', 'int64']:
+            raise ValueError(
+                "The value of 'dtype' in argmax op must be int32, int64, but received of {}".
+                format(dtype))
         var_dtype = convert_np_dtype_to_dtype_(dtype)
         attrs["dtype"] = var_dtype
     else:
         var_dtype = VarDesc.VarType.INT64
-    if out is None:
-        out = helper.create_variable_for_type_inference(var_dtype)
+
+    out = helper.create_variable_for_type_inference(var_dtype)
+    attrs['keepdims'] = keepdim
+    attrs['axis'] = axis
+    attrs['flatten'] = flatten
+    helper.append_op(
+        type='arg_max', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs)
+    out.stop_gradient = True
+    return out
+
+
+def argmin(x, axis=None, dtype=None, keepdim=False, name=None):
+    """
+    This OP computes the indices of the min elements of the input tensor's
+    element along the provided axis.
+
+    Args:
+        x(Tensor): An input N-D Tensor with type float32, float64, int16,
+            int32, int64, uint8.
+        axis(int, optional): Axis to compute indices along. The effective range
+            is [-R, R), where R is x.ndim. when axis < 0, it works the same way
+            as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index.
+        dtype(str): Data type of the output tensor which can
+                    be int32, int64. The default value is None, and it will
+                    return the int64 indices.
+        keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            data = np.array([[5,8,9,5],
+                             [0,0,1,7],
+                             [6,9,2,4]])
+            x =  paddle.to_variable(data)
+            out1 = paddle.argmin(x)
+            print(out1.numpy()) # 4
+            out2 = paddle.argmin(x, axis=1)
+            print(out2.numpy()) 
+            # [0 0 2]
+            out3 = paddle.argmin(x, axis=-1)
+            print(out3.numpy()) 
+            # [0 0 2]
+    """
+    flatten = False
     if axis is None:
-        axis = -1
-    attrs['keepdims'] = keepdims
+        flatten = True
+        axis = 0
+
+    if in_dygraph_mode():
+        if dtype != None:
+            var_dtype = convert_np_dtype_to_dtype_(dtype)
+            out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype,
+                                   'keepdim', keepdim, 'flatten', flatten)
+        else:
+            out = core.ops.arg_min(x, 'axis', axis, 'keepdim', keepdim,
+                                   'flatten', flatten)
+        return out
+
+    helper = LayerHelper("argmin", **locals())
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
+        'paddle.argmin')
+    var_dtype = None
+    attrs = {}
+    if dtype is not None:
+        if dtype not in ['int32', 'int64']:
+            raise ValueError(
+                "The value of 'dtype' in argmin op must be int32, int64, but received of {}".
+                format(dtype))
+        var_dtype = convert_np_dtype_to_dtype_(dtype)
+        attrs["dtype"] = var_dtype
+    else:
+        var_dtype = VarDesc.VarType.INT64
+
+    out = helper.create_variable_for_type_inference(var_dtype)
+    attrs['keepdims'] = keepdim
     attrs['axis'] = axis
+    attrs['flatten'] = flatten
     helper.append_op(
-        type='arg_max',
-        inputs={'X': input},
-        outputs={'Out': [out]},
-        attrs=attrs)
+        type='arg_min', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -250,14 +321,14 @@ def index_select(x, index, axis=0, name=None):
             import paddle
             import numpy as np
 
-            paddle.enable_imperative()  # Now we are in imperative mode
+            paddle.disable_static()  # Now we are in imperative mode
             data = np.array([[1.0, 2.0, 3.0, 4.0],
                              [5.0, 6.0, 7.0, 8.0],
                              [9.0, 10.0, 11.0, 12.0]])
             data_index = np.array([0, 1, 1]).astype('int32')
 
-            x = paddle.imperative.to_variable(data)
-            index = paddle.imperative.to_variable(data_index)
+            x = paddle.to_tensor(data)
+            index = paddle.to_tensor(data_index)
             out_z1 = paddle.index_select(x=x, index=index)
             #[[1. 2. 3. 4.]
             # [5. 6. 7. 8.]
@@ -399,17 +470,16 @@ def sort(x, axis=-1, descending=False, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            import paddle.imperative as imperative 
             import numpy as np
             
-            paddle.enable_imperative()
+            paddle.disable_static()
             input_array = np.array([[[5,8,9,5],
                             [0,0,1,7],
                             [6,9,2,4]],
                             [[5,2,4,2],
                             [4,7,7,9],
                             [1,7,0,6]]]).astype(np.float32)
-            x = imperative.to_variable(input_array)
+            x = paddle.to_variable(input_array)
             out1 = paddle.sort(x=x, axis=-1)
             out2 = paddle.sort(x=x, axis=0)
             out3 = paddle.sort(x=x, axis=1)
@@ -631,3 +701,154 @@ def index_sample(x, index):
                 'Index': index},
         outputs={'Out': out})
     return out
+
+
+def masked_select(x, mask, name=None):
+    """
+    This OP Returns a new 1-D tensor which indexes the input tensor according to the ``mask``
+    which is a tensor with data type of bool.
+
+    Args:
+        x (Tensor): The input Tensor, the data type can be int32, int64, float32, float64. 
+        mask (Tensor): The Tensor containing the binary mask to index with, it's data type is bool.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns: A 1-D Tensor which is the same data type  as ``x``.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of  float32, float64, int32 and int64.
+        TypeError: ``mask`` must be a Tensor and the data type of ``mask`` must be bool.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+            data = np.array([[1.0, 2.0, 3.0, 4.0],
+                                [5.0, 6.0, 7.0, 8.0],
+                                [9.0, 10.0, 11.0, 12.0]]).astype('float32')
+            
+            mask_data = np.array([[True, False, False, False],
+                            [True, True, False, False],
+                            [True, False, False, False]]).astype('bool')
+            x = paddle.to_tensor(data)
+            mask = paddle.to_tensor(mask_data)
+            out = paddle.masked_select(x, mask)
+            #[1.0 5.0 6.0 9.0]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.masked_select(x, mask)
+
+    helper = LayerHelper("masked_select", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
+                             'paddle.tensor.search.mask_select')
+    check_variable_and_dtype(mask, 'mask', ['bool'],
+                             'paddle.tensor.search.masked_select')
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='masked_select', inputs={'X': x,
+                                      'Mask': mask}, outputs={'Y': out})
+    return out
+
+
+def topk(x, k, axis=None, largest=True, sorted=True, name=None):
+    """
+    This OP is used to find values and indices of the k largest or smallest at the optional axis.
+    If the input is a 1-D Tensor, finds the k largest or smallest values and indices.
+    If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`.
+
+    Args:
+        x(Tensor): Tensor, an input N-D Tensor with type float32, float64, int32, int64.
+        k(int, Tensor): The number of top elements to look for along the axis.
+        axis(int, optional): Axis to compute indices along. The effective range
+            is [-R, R), where R is x.ndim. when axis < 0, it works the same way
+            as axis + R. Default is -1.
+        largest(bool, optional) : largest is a flag, if set to true,
+            algorithm will sort by descending order, otherwise sort by
+            ascending order. Default is True.
+        sorted(bool, optional): controls whether to return the elements in sorted order, default value is True. In gpu device, it always return the sorted value. 
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64.
+
+    Examples:
+
+        .. code-block:: python
+
+           import numpy as np
+           import paddle
+
+           paddle.disable_static()
+
+           data_1 = np.array([1, 4, 5, 7])
+           tensor_1 = paddle.to_tensor(data_1)
+           value_1, indices_1 = paddle.topk(tensor_1, k=1)
+           print(value_1.numpy())
+           # [7]
+           print(indices_1.numpy())
+           # [3] 
+           data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]])
+           tensor_2 = paddle.to_tensor(data_2)
+           value_2, indices_2 = paddle.topk(tensor_2, k=1)
+           print(value_2.numpy())
+           # [[7]
+           #  [6]]
+           print(indices_2.numpy())
+           # [[3]
+           #  [1]]
+           value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1)
+           print(value_3.numpy())
+           # [[7]
+           #  [6]]
+           print(indices_3.numpy())
+           # [[3]
+           #  [1]]
+           value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0)
+           print(value_4.numpy())
+           # [[2 6 5 7]]
+           print(indices_4.numpy())
+           # [[1 1 0 0]]
+
+    """
+    if in_dygraph_mode():
+        k = k.numpy().item(0) if isinstance(k, Variable) else k
+        if axis is None:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'largest', largest,
+                                             'sorted', sorted)
+        else:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'axis', axis, 'largest',
+                                             largest, 'sorted', sorted)
+        return out, indices
+
+    helper = LayerHelper("top_k_v2", **locals())
+    inputs = {"X": [x]}
+    attrs = {}
+    if isinstance(k, Variable):
+        inputs['K'] = [k]
+    else:
+        attrs = {'k': k}
+    attrs['largest'] = largest
+    attrs['sorted'] = sorted
+    if axis is not None:
+        attrs['axis'] = axis
+
+    values = helper.create_variable_for_type_inference(dtype=x.dtype)
+    indices = helper.create_variable_for_type_inference(dtype="int64")
+
+    helper.append_op(
+        type="top_k_v2",
+        inputs=inputs,
+        outputs={"Out": [values],
+                 "Indices": [indices]},
+        attrs=attrs)
+    indices.stop_gradient = True
+    return values, indices
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index 9b3bb081d9776a7ef88245f76d564c7b107ca669..91676a6316b81a1998b9b48fb9ea7fcba6d67c25 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -13,152 +13,253 @@
 # limitations under the License.
 
 # TODO: define statistical functions of a tensor  
-from ..fluid.layers import mean  #DEFINE_ALIAS
 from ..fluid.layers import reduce_mean  #DEFINE_ALIAS
 
-__all__ = ['mean', 'reduce_mean', 'std', 'var']
+__all__ = ['mean', 'reduce_mean', 'std', 'var', 'numel']
 
 import numpy as np
+from ..fluid.framework import Variable
 from ..fluid.layer_helper import LayerHelper
-from ..fluid.framework import in_dygraph_mode
+from ..fluid.framework import core, in_dygraph_mode
 from ..fluid import layers
 from .search import where
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+import paddle
 
 
-def var(input, axis=None, keepdim=False, unbiased=True, out=None, name=None):
+def mean(x, axis=None, keepdim=False, name=None):
     """
-	:alias_main: paddle.var
-	:alias: paddle.var,paddle.tensor.var,paddle.tensor.stat.var
-
-    Computes the variance of the input Variable's elements along the specified 
-    axis.
+    Computes the mean of the input tensor's elements along ``axis``.
 
     Args:
-        input (Variable): The input Variable to be computed variance, with data 
-            type float32 and float64 supported.
-        axis (list|int, optional): The axis along which the variance is computed. 
-            If `None`, compute the variance over all elements of :attr:`input`
-            and return a Variable with a single element, otherwise it must be in 
-            the range :math:`[-rank(input), rank(input))`. If :math:`axis[i] < 0`, 
-            the axis to compute is :math:`rank(input) + axis[i]`.
-        keepdim (bool, optional): Whether to reserve the reduced dimensions in 
-            the output Variable. The dimensions in :attr:`axis` will be squeezed 
-            and the result Variable will have :attr:`len(axis)` fewer dimensions 
-            than the :attr:`input` unless :attr:`keepdim` is true, default False.
-        unbiased (bool, optional): Whether to compute variance via the unbiased 
-            estimator, in which the divisor used in the computation is 
-            :math:`N - 1`, where :math:`N` represents the number of elements 
-            along :attr:`axis`, otherwise the divisor is :math:`N`. Default True.
-        out (Variable, optional): Alternate output Variable to store the result
-            variance. Default None.
-        name (str, optional): The name for this layer. Normally there is no 
-            need for user to set this property.  For more information, please 
-            refer to :ref:`api_guide_Name`. Default None.
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform mean
+            calculations. ``axis`` should be int, list(int) or tuple(int). If
+            ``axis`` is a list/tuple of dimension(s), mean is calculated along
+            all element(s) of ``axis`` . ``axis`` or element(s) of ``axis``
+            should be in range [-D, D), where D is the dimensions of ``x`` . If
+            ``axis`` or element(s) of ``axis`` is less than 0, it works the
+            same way as :math:`axis + D` . If ``axis`` is None, mean is
+            calculated over all elements of ``x``. Default is None.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keepdim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: The result variance with the same dtype as :attr:`input`. 
-            If :attr:`out = None`, returns a new Variable containing the 
-            variance, otherwise returns a reference to the output Variable.
+        Tensor, results of average along ``axis`` of ``x``, with the same data
+        type as ``x``.
 
     Examples:
         .. code-block:: python
 
+            import paddle
             import numpy as np
+
+            paddle.disable_static()
+
+            x = np.array([[[1, 2, 3, 4],
+                           [5, 6, 7, 8],
+                           [9, 10, 11, 12]],
+                          [[13, 14, 15, 16],
+                           [17, 18, 19, 20],
+                           [21, 22, 23, 24]]], 'float32')
+            x = paddle.to_tensor(x)
+            out1 = paddle.mean(x)
+            # [12.5]
+            out2 = paddle.mean(x, axis=-1)
+            # [[ 2.5  6.5 10.5]
+            #  [14.5 18.5 22.5]]
+            out3 = paddle.mean(x, axis=-1, keepdim=True)
+            # [[[ 2.5]
+            #   [ 6.5]
+            #   [10.5]]
+            #  [[14.5]
+            #   [18.5]
+            #   [22.5]]]
+            out4 = paddle.mean(x, axis=[0, 2])
+            # [ 8.5 12.5 16.5]
+    """
+
+    if isinstance(axis, int):
+        axis = [axis]
+    reduce_all = True if axis is None \
+        or len(axis)==0 \
+        or len(axis) == len(x.shape) else False
+    if axis is None or len(axis) == 0:
+        axis = [0]
+
+    if in_dygraph_mode():
+        return core.ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim,
+                                    'reduce_all', reduce_all)
+
+    check_variable_and_dtype(x, 'x/input', ['float32', 'float64'],
+                             'mean/reduce_mean')
+    check_type(axis, 'axis/dim', (int, list, tuple), 'mean/reduce_mean')
+    if isinstance(axis, (list, tuple)):
+        for item in axis:
+            check_type(item, 'elements of axis/dim', (int), 'mean/reduce_mean')
+
+    helper = LayerHelper('mean', **locals())
+    attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='reduce_mean', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    return out
+
+
+def var(x, axis=None, unbiased=True, keepdim=False, name=None):
+    """
+    Computes the variance of ``x`` along ``axis`` .
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform
+            variance calculations. ``axis`` should be int, list(int) or
+            tuple(int). If ``axis`` is a list/tuple of dimension(s), variance
+            is calculated along all element(s) of ``axis`` . ``axis`` or
+            element(s) of ``axis`` should be in range [-D, D), where D is the
+            dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is less
+            than 0, it works the same way as :math:`axis + D` . If ``axis`` is
+            None, variance is calculated over all elements of ``x``. Default
+            is None.
+        unbiased (bool, optional): Whether to use the unbiased estimation. If
+            ``unbiased`` is True, the divisor used in the computation is
+            :math:`N - 1`, where :math:`N` represents the number of elements
+            along ``axis`` , otherwise the divisor is :math:`N`. Default is True.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keepdim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, results of variance along ``axis`` of ``x``, with the same data
+        type as ``x``.
+
+    Examples:
+        .. code-block:: python
+
             import paddle
-            import paddle.fluid.dygraph as dg
-
-            a = np.array([[1.0, 2.0], [3.0, 4.0]]).astype("float32")
-            with dg.guard():
-                data = dg.to_variable(a)
-                variance = paddle.var(data, axis=[1])
-                print(variance.numpy())   
-                # [0.5 0.5]
+            import numpy as np
+            
+            paddle.disable_static()
+
+            x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])
+            x = paddle.to_tensor(x)
+            out1 = paddle.var(x)
+            # [2.66666667]
+            out2 = paddle.var(x, axis=1)
+            # [1.         4.33333333]
     """
-    dtype = convert_dtype(input.dtype)
-    if dtype not in ["float32", "float64"]:
-        raise ValueError("Layer tensor.var() only supports floating-point "
-                         "dtypes, but received {}.".format(dtype))
-    rank = len(input.shape)
-    axes = axis if axis != None and axis != [] else range(rank)
-    axes = [e if e >= 0 else e + rank for e in axes]
-    inp_shape = input.shape if in_dygraph_mode() else layers.shape(input)
-    mean = layers.reduce_mean(input, dim=axis, keep_dim=True, name=name)
-    tmp = layers.reduce_mean(
-        (input - mean)**2, dim=axis, keep_dim=keepdim, name=name)
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'var')
+
+    u = mean(x, axis, True, name)
+    out = paddle.sum((x - u)**2, axis, keepdim=keepdim, name=name)
 
+    n = paddle.cast(paddle.numel(x), x.dtype) \
+        / paddle.cast(paddle.numel(out), x.dtype)
     if unbiased:
-        n = 1
-        for i in axes:
-            n *= inp_shape[i]
-        if not in_dygraph_mode():
-            n = layers.cast(n, dtype)
-            zero_const = layers.fill_constant(shape=[1], dtype=dtype, value=0.0)
-            factor = where(n > 1.0, n / (n - 1.0), zero_const)
-        else:
-            factor = n / (n - 1.0) if n > 1.0 else 0.0
-        tmp *= factor
-    if out:
-        layers.assign(input=tmp, output=out)
-        return out
-    else:
-        return tmp
-
-
-def std(input, axis=None, keepdim=False, unbiased=True, out=None, name=None):
+        one_const = paddle.ones([1], x.dtype)
+        n = where(n > one_const, n - 1., one_const)
+    out /= n
+    return out
+
+
+def std(x, axis=None, unbiased=True, keepdim=False, name=None):
+    """
+    Computes the standard-deviation of ``x`` along ``axis`` .
+
+    Args:
+        x (Tensor): The input Tensor with data type float32, float64.
+        axis (int|list|tuple, optional): The axis along which to perform
+            standard-deviation calculations. ``axis`` should be int, list(int)
+            or tuple(int). If ``axis`` is a list/tuple of dimension(s),
+            standard-deviation is calculated along all element(s) of ``axis`` .
+            ``axis`` or element(s) of ``axis`` should be in range [-D, D),
+            where D is the dimensions of ``x`` . If ``axis`` or element(s) of
+            ``axis`` is less than 0, it works the same way as :math:`axis + D` .
+            If ``axis`` is None, standard-deviation is calculated over all
+            elements of ``x``. Default is None.
+        unbiased (bool, optional): Whether to use the unbiased estimation. If
+            ``unbiased`` is True, the standard-deviation is calculated via the
+            unbiased estimator. If ``unbiased`` is True,  the divisor used in
+            the computation is :math:`N - 1`, where :math:`N` represents the
+            number of elements along ``axis`` , otherwise the divisor is
+            :math:`N`. Default is True.
+        keepdim (bool, optional): Whether to reserve the reduced dimension(s)
+            in the output Tensor. If ``keepdim`` is True, the dimensions of
+            the output Tensor is the same as ``x`` except in the reduced
+            dimensions(it is of size 1 in this case). Otherwise, the shape of
+            the output Tensor is squeezed in ``axis`` . Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, results of standard-deviation along ``axis`` of ``x``, with the
+        same data type as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+
+            x = np.array([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])
+            x = paddle.to_tensor(x)
+            out1 = paddle.std(x)
+            # [1.63299316]
+            out2 = paddle.std(x, axis=1)
+            # [1.       2.081666]
     """
-	:alias_main: paddle.std
-	:alias: paddle.std,paddle.tensor.std,paddle.tensor.stat.std
+    if not in_dygraph_mode():
+        check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'std')
 
-    Computes the standard-deviation  of the input Variable's elements along the specified 
-    axis.
+    out = var(**locals())
+    return paddle.sqrt(out)
+
+
+def numel(x, name=None):
+    """
+    Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static mode
+    or a scalar value in imperative mode
 
     Args:
-        input (Variable): The input Variable to be computed standard-deviation, with data 
-            type float32 and float64 supported.
-        axis (list|int, optional): The axis along which the standard-deviation is computed. 
-            If `None`, compute the standard-deviation over all elements of :attr:`input`
-            and return a Variable with a single element, otherwise it must be in 
-            the range :math:`[-rank(input), rank(input))`. If :math:`axis[i] < 0`, 
-            the axis to compute is :math:`rank(input) + axis[i]`.
-        keepdim (bool, optional): Whether to reserve the reduced dimensions in 
-            the output Variable. The dimensions in :attr:`axis` will be squeezed 
-            and the result Variable will have :attr:`len(axis)` fewer dimensions 
-            than the :attr:`input` unless :attr:`keepdim` is true, default False.
-        unbiased (bool, optional): Whether to compute standard-deviation via the unbiased 
-            estimator, in which the divisor used in the computation is 
-            :math:`N - 1`, where :math:`N` represents the number of elements 
-            along :attr:`axis`, otherwise the divisor is :math:`N`. Default True.
-        out (Variable, optional): Alternate output Variable to store the result
-            standard-deviation . Default None.
-        name (str, optional): The name for this layer. Normally there is no 
-            need for user to set this property.  For more information, please 
-            refer to :ref:`api_guide_Name`. Default None.
+        x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64.
 
     Returns:
-        Variable: The result standard-deviation  with the same dtype as :attr:`input`. 
-            If :attr:`out = None`, returns a new Variable containing the 
-            standard-deviation , otherwise returns a reference to the output Variable.
+        Tensor: The number of elements for the input Tensor.
+    
+    Raises:
+        TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of bool, float16, float32, float64, int32, int64.
+
+
     Examples:
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
-            # x is a Tensor variable with following elements:
-            #    [[0.2, 0.3, 0.5, 0.9]
-            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the corresponding output tensor.
-            x = fluid.data(name='x', shape=[2, 4], dtype='float32')
-            paddle.std(x)  # [0.28252685] 
-            paddle.std(x, axis=[0])  # [0.0707107, 0.07071075, 0.07071064, 0.1414217]
-            paddle.std(x, axis=[-1])  # [0.30956957, 0.29439208] 
+            
+            paddle.disable_static()
+            x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32')
+            numel = paddle.numel(x) # 140
+
+
     """
-    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'std')
-
-    tmp = var(input, axis=axis, keepdim=keepdim, unbiased=unbiased, name=name)
-    tmp = layers.sqrt(tmp)
-    if out is not None:
-        layers.assign(input=tmp, output=out)
-        return out
-    else:
-        return tmp
+    if in_dygraph_mode():
+        return core.ops.size(x)
+
+    if not isinstance(x, Variable):
+        raise TypeError("x must be a Tensor in numel")
+    helper = LayerHelper('numel', **locals())
+    out = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.INT64)
+    helper.append_op(type='size', inputs={'Input': x}, outputs={'Out': out})
+    return out
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4
--- /dev/null
+++ b/python/paddle/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/tests/test_metrics.py b/python/paddle/tests/test_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..2272a81b3f602ec46972c9d4620ded9680e2ff5f
--- /dev/null
+++ b/python/paddle/tests/test_metrics.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+
+from paddle.incubate.hapi.utils import to_list
+
+
+def accuracy(pred, label, topk=(1, )):
+    maxk = max(topk)
+    pred = np.argsort(pred)[:, ::-1][:, :maxk]
+    correct = (pred == np.repeat(label, maxk, 1))
+
+    batch_size = label.shape[0]
+    res = []
+    for k in topk:
+        correct_k = correct[:, :k].sum()
+        res.append(float(correct_k) / batch_size)
+    return res
+
+
+def convert_to_one_hot(y, C):
+    oh = np.random.choice(np.arange(C), C, replace=False).astype('float32') / C
+    oh = np.tile(oh[np.newaxis, :], (y.shape[0], 1))
+    for i in range(y.shape[0]):
+        oh[i, int(y[i])] = 1.
+    return oh
+
+
+class TestAccuracy(unittest.TestCase):
+    def test_acc(self):
+        paddle.disable_static()
+
+        x = paddle.to_tensor(
+            np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2],
+                      [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]]))
+        y = paddle.to_tensor(np.array([[0], [1], [2], [3]]))
+
+        m = paddle.metric.Accuracy(name='my_acc')
+
+        # check name
+        self.assertEqual(m.name(), ['my_acc'])
+
+        correct = m.compute(x, y)
+        # check results
+        self.assertEqual(m.update(correct), 0.75)
+        self.assertEqual(m.accumulate(), 0.75)
+
+        x = paddle.to_tensor(
+            np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.3, 0.4, 0.2],
+                      [0.1, 0.2, 0.4, 0.3], [0.1, 0.2, 0.3, 0.4]]))
+        y = paddle.to_tensor(np.array([[0], [1], [2], [3]]))
+        correct = m.compute(x, y)
+        # check results
+        self.assertEqual(m.update(correct), 0.5)
+        self.assertEqual(m.accumulate(), 0.625)
+
+        # check reset
+        m.reset()
+        self.assertEqual(m.total[0], 0.0)
+        self.assertEqual(m.count[0], 0.0)
+        paddle.enable_static()
+
+
+class TestAccuracyDynamic(unittest.TestCase):
+    def setUp(self):
+        self.topk = (1, )
+        self.class_num = 5
+        self.sample_num = 1000
+        self.name = None
+
+    def random_pred_label(self):
+        label = np.random.randint(0, self.class_num,
+                                  (self.sample_num, 1)).astype('int64')
+        pred = np.random.randint(0, self.class_num,
+                                 (self.sample_num, 1)).astype('int32')
+        pred_one_hot = convert_to_one_hot(pred, self.class_num)
+        pred_one_hot = pred_one_hot.astype('float32')
+
+        return label, pred_one_hot
+
+    def test_main(self):
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            acc = paddle.metric.Accuracy(topk=self.topk, name=self.name)
+            for _ in range(10):
+                label, pred = self.random_pred_label()
+                label_var = paddle.to_tensor(label)
+                pred_var = paddle.to_tensor(pred)
+                state = to_list(acc.compute(pred_var, label_var))
+                acc.update(* [s.numpy() for s in state])
+                res_m = acc.accumulate()
+                res_f = accuracy(pred, label, self.topk)
+                assert np.all(np.isclose(np.array(res_m, dtype='float64'),
+                              np.array(res_f, dtype='float64'), rtol=1e-3)), \
+                    "Accuracy precision error: {} != {}".format(res_m, res_f)
+                acc.reset()
+                assert np.sum(acc.total) == 0
+                assert np.sum(acc.count) == 0
+
+
+class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic):
+    def setUp(self):
+        self.topk = (1, 5)
+        self.class_num = 10
+        self.sample_num = 1000
+        self.name = "accuracy"
+
+
+class TestAccuracyStatic(TestAccuracyDynamic):
+    def test_main(self):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        main_prog.random_seed = 1024
+        startup_prog.random_seed = 1024
+        with fluid.program_guard(main_prog, startup_prog):
+            pred = fluid.data(
+                name='pred', shape=[None, self.class_num], dtype='float32')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+            acc = paddle.metric.Accuracy(topk=self.topk, name=self.name)
+            state = acc.compute(pred, label)
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        compiled_main_prog = fluid.CompiledProgram(main_prog)
+
+        for _ in range(10):
+            label, pred = self.random_pred_label()
+            state_ret = exe.run(compiled_main_prog,
+                                feed={'pred': pred,
+                                      'label': label},
+                                fetch_list=[s.name for s in to_list(state)],
+                                return_numpy=True)
+            acc.update(*state_ret)
+            res_m = acc.accumulate()
+            res_f = accuracy(pred, label, self.topk)
+            assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \
+                    "Accuracy precision error: {} != {}".format(res_m, res_f)
+            acc.reset()
+            assert np.sum(acc.total) == 0
+            assert np.sum(acc.count) == 0
+
+
+class TestAccuracyStaticMultiTopk(TestAccuracyStatic):
+    def setUp(self):
+        self.topk = (1, 5)
+        self.class_num = 10
+        self.sample_num = 100
+        self.name = "accuracy"
+
+
+class TestPrecision(unittest.TestCase):
+    def test_1d(self):
+        paddle.disable_static()
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([1, 0, 1, 1])
+
+        m = paddle.metric.Precision()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 2. / 3.)
+
+        x = paddle.to_tensor(np.array([0.1, 0.5, 0.6, 0.7, 0.2]))
+        y = paddle.to_tensor(np.array([1, 0, 1, 1, 1]))
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 4. / 6.)
+
+        paddle.enable_static()
+
+    def test_2d(self):
+        paddle.disable_static()
+
+        x = np.array([0.1, 0.5, 0.6, 0.7]).reshape(-1, 1)
+        y = np.array([1, 0, 1, 1]).reshape(-1, 1)
+
+        m = paddle.metric.Precision()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 2. / 3.)
+
+        x = np.array([0.1, 0.5, 0.6, 0.7, 0.2]).reshape(-1, 1)
+        y = np.array([1, 0, 1, 1, 1]).reshape(-1, 1)
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 4. / 6.)
+
+        # check reset
+        m.reset()
+        self.assertEqual(m.tp, 0.0)
+        self.assertEqual(m.fp, 0.0)
+        self.assertEqual(m.accumulate(), 0.0)
+
+        paddle.enable_static()
+
+
+class TestRecall(unittest.TestCase):
+    def test_1d(self):
+        paddle.disable_static()
+
+        x = np.array([0.1, 0.5, 0.6, 0.7])
+        y = np.array([1, 0, 1, 1])
+
+        m = paddle.metric.Recall()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 2. / 3.)
+
+        x = paddle.to_tensor(np.array([0.1, 0.5, 0.6, 0.7]))
+        y = paddle.to_tensor(np.array([1, 0, 0, 1]))
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 3. / 5.)
+
+        # check reset
+        m.reset()
+        self.assertEqual(m.tp, 0.0)
+        self.assertEqual(m.fn, 0.0)
+        self.assertEqual(m.accumulate(), 0.0)
+        paddle.enable_static()
+
+
+class TestAuc(unittest.TestCase):
+    def test_auc_numpy(self):
+        paddle.disable_static()
+        x = np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70],
+                      [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]])
+        y = np.array([[0], [1], [1], [0], [1], [0], [0], [1]])
+        m = paddle.metric.Auc()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 0.8125)
+
+        m.reset()
+        self.assertEqual(m.accumulate(), 0.0)
+
+        paddle.enable_static()
+
+    def test_auc_tensor(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(
+            np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70],
+                      [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]]))
+        y = paddle.to_tensor(np.array([[0], [1], [1], [0], [1], [0], [0], [1]]))
+        m = paddle.metric.Auc()
+        m.update(x, y)
+        r = m.accumulate()
+        self.assertAlmostEqual(r, 0.8125)
+
+        m.reset()
+        self.assertEqual(m.accumulate(), 0.0)
+
+        paddle.enable_static()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index 27621c2d872a6d10ec3259312abe318fef5b334b..08fd7e33479b331454f63f05f6240dd221591ee9 100644
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -19,6 +19,14 @@ import warnings
 import functools
 import paddle
 
+# NOTE(zhiqiu): Since python 3.2, DeprecationWarning is ignored by default,
+# and since python 3.7, it is once again shown by default when triggered directly by code in __main__.
+# See details: https://docs.python.org/3/library/warnings.html#default-warning-filter
+# The following line set DeprecationWarning to show once, which is expected to work in python 3.2 -> 3.6
+# However, doing this could introduce one samll side effect, i.e., the DeprecationWarning which is not issued by @deprecated.
+# The side effect is acceptable, and we will find better way to do this if we could.
+warnings.simplefilter('default', DeprecationWarning)
+
 
 def deprecated(update_to="", since="", reason=""):
     """Decorate a function to signify its deprecation.
@@ -36,6 +44,8 @@ def deprecated(update_to="", since="", reason=""):
     """
 
     def decorator(func):
+        # TODO(zhiqiu): We temporally disable the warnings for 2.0-bata, and it should be re-enabled in the future.
+        return func
         """construct warning message, and return a decorated function or class."""
         assert isinstance(update_to, str), 'type of "update_to" must be str.'
         assert isinstance(since, str), 'type of "since" must be str.'
@@ -54,7 +64,7 @@ def deprecated(update_to="", since="", reason=""):
                 "paddle."
             ), 'Argument update_to must start with "paddle.", your value is "{}"'.format(
                 update_to)
-            msg += ' Use "{}" instead.'.format(_update_to)
+            msg += ' Please use "{}" instead.'.format(_update_to)
         if len(_reason) > 0:
             msg += "\n reason: {}".format(_reason)
 
@@ -70,11 +80,8 @@ def deprecated(update_to="", since="", reason=""):
             v_since = [int(i) for i in _since.split(".")]
             v_since += [0] * (4 - len(v_since))
             if paddle.__version__ == "0.0.0" or _since == "" or v_current >= v_since:
-                warnings.simplefilter('always',
-                                      DeprecationWarning)  # turn off filter
                 warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
-                warnings.simplefilter('default',
-                                      DeprecationWarning)  # reset filter
+
             return func(*args, **kwargs)
 
         return wrapper
diff --git a/python/requirements.txt b/python/requirements.txt
index 5e081f5e85b6e0f645991ab70874d04ab93e3106..28c84c1d630a66077a737a48bb8a26200ec48f0b 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,12 +1,13 @@
+opencv-python<=4.2.0.32
 requests>=2.20.0
-numpy>=1.12, <=1.16.4 ; python_version<"3.5"
-numpy>=1.12 ; python_version>="3.5"
+numpy>=1.13, <=1.16.4 ; python_version<"3.5"
+numpy>=1.13 ; python_version>="3.5"
 protobuf>=3.1.0
-gast>=0.3.3
+gast==0.3.3
 matplotlib<=2.2.4 ; python_version<"3.6"
 scipy>=0.19.0, <=1.2.1 ; python_version<"3.5"
 nltk>=3.2.2, <=3.4 ; python_version<"3.5"
-matplotlib ; python_version>="3.6"
+matplotlib<=3.2.1 ; python_version>="3.6"
 scipy<=1.3.1 ; python_version=="3.5"
 scipy ; python_version>"3.5"
 nltk ; python_version>="3.5"
@@ -21,3 +22,4 @@ prettytable
 objgraph
 astor
 pathlib
+netifaces
diff --git a/python/setup.py.in b/python/setup.py.in
index df200da2cfc5b927402b2ed183eff5038aec8764..5b206296bd641bf909115d1c580518afe85a37b6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -145,17 +145,20 @@ packages=['paddle',
           'paddle.incubate',
           'paddle.incubate.complex',
           'paddle.incubate.complex.tensor',
-          'paddle.fleet',
-          'paddle.fleet.base',
-          'paddle.fleet.meta_optimizers',
-          'paddle.fleet.runtime',
-          'paddle.fleet.dataset',
-          'paddle.fleet.metrics',
-          'paddle.fleet.proto',
+          'paddle.distributed.fleet',
+          'paddle.distributed.fleet.base',
+          'paddle.distributed.fleet.meta_optimizers',
+          'paddle.distributed.fleet.runtime',
+          'paddle.distributed.fleet.dataset',
+          'paddle.distributed.fleet.metrics',
+          'paddle.distributed.fleet.proto',
+          'paddle.distributed.fleet.utils',
           'paddle.framework',
+          'paddle.jit',
           'paddle.fluid',
           'paddle.fluid.dygraph',
           'paddle.fluid.dygraph.dygraph_to_static',
+          'paddle.fluid.dygraph.amp',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.distributed',
@@ -177,6 +180,7 @@ packages=['paddle',
           'paddle.fluid.incubate',
           'paddle.fluid.incubate.data_generator',
           'paddle.fluid.incubate.fleet',
+          'paddle.fluid.incubate.checkpoint',
           'paddle.fluid.incubate.fleet.base',
           'paddle.fluid.incubate.fleet.parameter_server',
           'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
@@ -197,9 +201,10 @@ packages=['paddle',
           'paddle.nn.functional',
           'paddle.nn.layer',
           'paddle.nn.initializer',
+          'paddle.nn.utils',
           'paddle.metric',
-          'paddle.imperative',
-          'paddle.imperative.jit',
+          'paddle.static',
+          'paddle.static.nn',
           'paddle.tensor',
           ]
 
@@ -298,6 +303,23 @@ if '${WITH_MKLDNN}' == 'ON':
     else:
         package_data['paddle.libs']+=['mkldnn.dll']
 
+if '${WITH_XPU}' == 'ON':
+    # only change rpath in Release mode,
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        if os.name != 'nt':
+            if "@APPLE@" == "1":
+                command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
+            else:
+                command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command)
+    shutil.copy('${XPU_API_LIB}', libs_path)
+    shutil.copy('${XPU_RT_LIB}', libs_path)
+    shutil.copy('${XPU_SIM_LIB}', libs_path)
+    package_data['paddle.libs']+=['${XPU_API_LIB_NAME}',
+                                  '${XPU_RT_LIB_NAME}',
+                                  '${XPU_SIM_LIB_NAME}']
+
 # copy libfuild_framework.so to libs
 if os.name != 'nt' and sys.platform != 'darwin':
     paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}'
@@ -478,7 +500,7 @@ with redirect_stdout():
         },
         entry_points={
             'console_scripts': [
-                'fleetrun = paddle.fleet.launch:launch'
+                'fleetrun = paddle.distributed.fleet.launch:launch'
             ]
         }
     )
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index b1f3f84b36ee295529661cf74b13e71d620254c9..f7ee09e11ea5e3a3b5ba4ce6b3be8af4abe7cae4 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -5,48 +5,10 @@ if [ -z ${BRANCH} ]; then
 fi
 
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
-API_FILES=("CMakeLists.txt"
-           "paddle/fluid/framework/operator.h"
-           "paddle/fluid/framework/tensor.h"
-           "paddle/fluid/framework/details/op_registry.h"
-           "paddle/fluid/framework/grad_op_desc_maker.h"
-           "paddle/fluid/framework/lod_tensor.h"
-           "paddle/fluid/framework/selected_rows.h"
-           "paddle/fluid/framework/op_desc.h"
-           "paddle/fluid/framework/block_desc.h"
-           "paddle/fluid/framework/var_desc.h"
-           "paddle/fluid/framework/scope.h"
-           "paddle/fluid/framework/ir/node.h"
-           "paddle/fluid/framework/ir/graph.h"
-           "paddle/fluid/framework/framework.proto"
-	   "python/paddle/fleet/__init__.py"
-           "python/requirements.txt"
-           "python/paddle/fluid/__init__.py"
-           "python/paddle/fluid/compiler.py"
-           "python/paddle/fluid/parallel_executor.py"
-           "python/paddle/fluid/framework.py"
-           "python/paddle/fluid/backward.py"
-           "paddle/fluid/operators/distributed/send_recv.proto.in"
-           "paddle/fluid/framework/unused_var_check.cc"
-           "paddle/fluid/pybind/op_function_generator.cc"
-           "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py"
-           "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py"
-           "tools/wlist.json"
-           )
-
 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
-git_files=`git diff --numstat upstream/$BRANCH| wc -l`
-git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'`
 failed_num=0
 echo_list=()
 
-
 function check_approval(){
     person_num=`echo $@|awk '{for (i=2;i<=NF;i++)print $i}'`
     APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py $1 $person_num`
@@ -55,18 +17,12 @@ function check_approval(){
     fi
 }
 
-
 function add_failed(){
     failed_num=`expr $failed_num + 1`
     echo_list="${echo_list[@]}$1"
 }
 
 
-if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
-    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n"
-    check_approval 1 38231817
-fi    
-
 api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api` 
 if [ "$api_spec_diff" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01 or lanxianghit) and one TPM (saxon-zh or jzhang533 or swtkiwi or Heeenrrry or TCChenlong) approval for the api change for the management reason of API interface.\n"
@@ -101,213 +57,6 @@ if [ "$op_desc_diff" != "" ]; then
     check_approval 1 33742067 7913861 9301846 47554610 43953930
 fi
 
-for API_FILE in ${API_FILES[*]}; do
-  API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
-  if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
-      # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-      # You can use http://caius.github.io/github_id/ to find Github user id.
-      # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059,Boyan-Liu 31623103, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, JepsonWong 16509038, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676. Dong Daxiang 35550832.
-      if [ "${API_FILE}" == "CMakeLists.txt" ];then
-          echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
-          check_approval 1 6836917 46782768
-      elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
-          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
-          check_approval 1 6836917 47554610
-      elif [ "${API_FILE}" == "python/requirements.txt" ];then
-          echo_line="You must have one RD (kolinwei (Recommend), JepsonWong or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n"
-          check_approval 1 22165420 16509038 6836917
-      elif [ "${API_FILE}" == "paddle/fluid/operators/distributed/send_recv.proto.in" ];then
-          echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n"
-          check_approval 1 10721757 5442383
-      elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then
-          echo_line="You must have one RD (zhiqiu (Recommend) , sneaxiy or luotao1) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
-          check_approval 1 6888866 32832641 6836917
-      elif [ "${API_FILE}" == "paddle/fluid/pybind/op_function_generator.cc" ];then
-          echo_line="You must have one RD (zhiqiu (Recommend) , phlrain) approval for the changes of paddle/fluid/pybind/op_function_generator.cc, which manages the logic of automatic generating op functions for dygraph. \n"
-          check_approval 1 6888866 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py" ];then
-          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (hong19860320 (Recommend), luotao1, phlrain) approval for the changes of check_shape_white_list.py, which manages the white list of operators with limited input size. Inputs size of all cases in the op test must be greater than or equal to 100. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/OP-Test-Input-Shape-Requirements. \n"
-          check_approval 1 9973393 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py" ];then
-          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 (Recommend) or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py, which manages the white list of upgrading the precision of op test to float64. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
-          check_approval 1 52520497 26615455 6836917
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py" ];then
-           echo_line="You must have one RD (DannyIsFunny (Recommend), luotao1, phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py, which manages the white list of compile&runtime lod-level check. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Compile_vs_Runtime-Check-Specification. \n"
-          check_approval 1 45189361 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py" ];then
-          echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n"
-          check_approval 1 12407750 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then
-          echo_line="You must have one RD (JepsonWong (Recommend), luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
-          check_approval 1 16509038 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" ];then
-          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py, which manages the white list of error threshold for op test with float64 precision. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
-          check_approval 1 52520497 26615455 6836917
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" ];then
-          echo_line="You must have one RD (songyouwei, luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
-          check_approval 1 2573291 6836917 43953930
-      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then
-        echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
-        check_approval 1 39303645 6836917 43953930
-      elif [ "${API_FILE}" == "tools/wlist.json" ];then
-        echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n"
-        check_approval 1 29231
-      elif [ "${API_FILE}" == "python/paddle/fleet/__init__.py" ]; then
-	echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
-	check_approval 1 35550832 38231817
-      else
-          echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
-          check_approval 1 3048612 46782768 12538138 6836917 32832641
-      fi
-  fi
-done
-
-HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
-if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for the usage (either add or delete) of const_cast.\n"
-    check_approval 1 3048612 46782768 12538138 6836917 32832641
-fi
-
-HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -o -m 1 "boost::get" || true`
-if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="boost::get is not recommended, because it may throw an bad_get exception without any stack information, so please use BOOST_GET(_**)(dtype, value) series macros here. If these macros cannot meet your needs, please use try-catch to handle boost::get and request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
-if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
-if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag.\n"
-    check_approval 1 47554610
-fi
-
-HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
-if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), liuwei1031, or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
-    check_approval 1 22165420 6836917 46661762
-  fi
-
-HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true`
-if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must have one RD (Superjomn (Recommend), luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n"
-    check_approval 1 328693 6836917
-  fi
-
-ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
-if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
-VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
-INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
-if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
-    check_approval 1 6836917 47554610 22561442
-fi
-
-ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"`
-ALL_OPTEST_BAN_DYGRAPH_MESSAGE=""
-for CHANGE_FILE in ${ALL_CHANGE_FILES}; do
-    ALL_OPTEST_BAN_DYGRAPH=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CHANGE_FILE} | grep "+" | grep "check_dygraph=" || true`
-    if [ "${ALL_OPTEST_BAN_DYGRAPH}" != "" ]; then
-        ALL_OPTEST_BAN_DYGRAPH_MESSAGE="${ALL_OPTEST_BAN_DYGRAPH_MESSAGE} ${CHANGE_FILE} : \n${ALL_OPTEST_BAN_DYGRAPH} \n"
-    fi
-done
-if [ "${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="Developers are not allowed to set the check_dygraph field directly, which is set to True by default. If you need to change the check_dygraph field, you must have one RD (phlrain (Recommend) or lanxianghit) review and approve. \nThe code that do not meet the specification are as follows:\n${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}\n"
-    check_approval 1 43953930 47554610
-fi
-
-NEW_OP_ADDED=`git diff --name-only --diff-filter=A upstream/$BRANCH |grep -oE ".+_op..*" || true`
-if [ "${NEW_OP_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    GET_KERNEL_TYPE_FUNC_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -czoE "GetExpectedKernelType[(][^(){}]+[)][^{]+[{][^}]+[}]" || true`
-    INDICATE_VAR_DTYPE_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -co "IndicateVarDataType" || true`
-    if [ ${GET_KERNEL_TYPE_FUNC_CNT} -gt ${INDICATE_VAR_DTYPE_CNT} ]; then
-        echo_line="If you override GetExpectedKernelType method of OperatorWithKernel, please use OperatorWithKernel::IndicateVarDataType() method to get specific input variable's dtype, which checked whether the input variable is initialized (The details in https://github.com/PaddlePaddle/FluidDoc/pull/1527). If you don't use this method to check, you must have one RD (chenwhql (Recommend) , luotao1 or lanxianghit) approval for the usage of other methods.\n"
-        check_approval 1 6836917 47554610 22561442
-    fi
-fi
-
-HAS_OPERATORBASE_FLAG=`git diff -U0 --diff-filter=A upstream/$BRANCH | grep -E "public[[:space:]]+.*OperatorBase" || true`
-if [ "${HAS_OPERATORBASE_FLAG}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="In order to support dynamic graph, all ops are not recommended to inherit OperatorBase. Please use OperatorWithKernel instead.\nYou must have one RD (phlrain (Recommend), luotao1, lanxianghit or XiaoguangHu01) approval for the inherit of OperatorBase.\nYou inherit the OperatorBase class. The corresponding lines are as follows:\n${HAS_OPERATORBASE_FLAG}"
-    check_approval 1 43953930 6836917 47554610 46782768
-fi
-
-HAS_INPLACE_TESTS=`git diff -U0 upstream/$BRANCH |grep "+" |grep -E "inplace_atol[[:space:]]*=.*" || true`
-if [ "${HAS_INPLACE_TESTS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="The calculation results of setting inplace enabled and disabled must be equal, that is, it's not recommended to set inplace_atol.\n If you do need to use inplace_atol, you must have one RD (XiaoguangHu01, lanxianghit, phlrain, luotao1) approval for the usage of inplace_atol.\nThe corresponding lines are as follows:\n${HAS_INPLACE_TESTS}\n"
-    check_approval 1 46782768 47554610 43953930 6836917
-fi
-
-OP_FILE_CHANGED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE ".+_op..*" || true`
-if [ "${OP_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    for OP_FILE in ${OP_FILE_CHANGED};
-    do
-        CHECK_OBJECT_FLAGS=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${OP_FILE} |grep "+" |grep -E "ShareDataWith[(]|ShareBufferWith[(]" || true`
-        if [ "${CHECK_OBJECT_FLAGS}" != "" ]; then
-            ERROR_LINES="${ERROR_LINES}\n${OP_FILE}${CHECK_OBJECT_FLAGS}\n"
-        fi
-    done
-    if [ "${ERROR_LINES}" != "" ]; then
-        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
-        echo_line="Using ShareDataWith or ShareBufferWith is not recommended. You must have one RD's (zhhsplendid (Recommend), sneaxiy or luotao1 or lanxianghit) approval to use these methods. For more information, please refer to https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-is-prohibited-in-OP. The error lines are as follows:${ERROR_LINES}"
-        check_approval 1 6836917 32832641 47554610 7913861
-    fi
-fi
-
-NEW_OP_TEST_ADDED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE "test_.*.\.py" || true`
-if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    CHECK_OUTPUT=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep "self\.check_output(a*t*o*l*=*[0-9]"|grep "+" || true`
-    CHECK_OUTPUT_WITH_PLACE=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 "self\.check_output_with_place" |grep ", [atol*,0-9]"|grep "+" || true`
-    CHECK_GRAD=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A5 -E "self\.check_grad|self\.check_grad_with_place"|grep "max_relative_error=" |grep "+" || true`
-    CHECK_GRAD_CHECK=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 -E "checker\.double_grad_check"|grep "eps=|atol=|rtol=" |grep "+" || true`
-    CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK
-    if [ "${CHECK_WHOLE}" != "" ] ; then
-        CHECK_OP=${CHECK_WHOLE//+/'\n+'}       
-        echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n"
-        check_approval 1 6836917 47554610 12538138 43953930
-    fi
-fi
-
-UNITTEST_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "test_.*.\.py" || true`
-if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    for TEST_FILE in ${UNITTEST_FILE_CHANGED};
-    do
-        HAS_SKIP_CHECK_GRAD_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "@skip_check_grad_ci" || true`
-        if [ "${HAS_SKIP_CHECK_GRAD_CI}" != "" ]; then
-            ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_GRAD_CI}\n"
-        fi
-    done
-    if [ "${ERROR_LINES}" != "" ]; then
-        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
-        echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (zhangting2020 (Recommend), luotao1 or phlrain) approval for the usage (either add or delete) of @skip_check_grad_ci. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Gradient-Check-Is-Required-for-Op-Test. The corresponding lines are as follows:\n${ERROR_LINES}\n"
-        check_approval 1 26615455 6836917 43953930
-    fi
-fi
-
-RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true`
-if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED};
-    do
-        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|PROPERTIES[[:space:]]+TIMEOUT" || true`
-	if [[ ${RUNTYPE_ADD} != "" ]];then
-	    RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n"
-	fi
-    done
-    if [[ ${RUNTYPE_ADD_LINES} != "" ]];then
-        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE to EXCLUSIVE or DIST, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
-	check_approval 1 32428676 45041955
-    fi
-fi
-
 DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec
 PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_PR.spec
 ADDED_OP_USE_DEFAULT_GRAD_MAKER=`python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC} ${PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC}` 
@@ -316,16 +65,6 @@ if [ "${ADDED_OP_USE_DEFAULT_GRAD_MAKER}" != "" ]; then
   check_approval 1 32832641 6836917
 fi
 
-# Get the list of PR authors with unresolved unit test issues
-pip install PyGithub
-# For getting PR related data
-wget https://paddle-ci.gz.bcebos.com/blk/block.txt
-HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
-if [ "${HASUTFIXED}" != "" ]; then
-  echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n"
-  check_approval 1 45041955 22165420
-fi
-
 if [ -n "${echo_list}" ];then
   echo "****************"
   echo -e "${echo_list[@]}"
@@ -336,5 +75,5 @@ fi
 python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec
 python ${PADDLE_ROOT}/tools/check_op_register_type.py ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_PR.spec
 if [ -n "${echo_list}" ]; then
-  exit 1
+  exit 6
 fi
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2c575e4abf1beed039d3293821b8df356d4e9295
--- /dev/null
+++ b/tools/check_file_diff_approvals.sh
@@ -0,0 +1,301 @@
+#!/bin/bash
+
+if [ -z ${BRANCH} ]; then
+    BRANCH="develop"
+fi
+
+PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
+API_FILES=("CMakeLists.txt"
+           "paddle/fluid/framework/operator.h"
+           "paddle/fluid/framework/tensor.h"
+           "paddle/fluid/framework/details/op_registry.h"
+           "paddle/fluid/framework/grad_op_desc_maker.h"
+           "paddle/fluid/framework/lod_tensor.h"
+           "paddle/fluid/framework/selected_rows.h"
+           "paddle/fluid/framework/op_desc.h"
+           "paddle/fluid/framework/block_desc.h"
+           "paddle/fluid/framework/var_desc.h"
+           "paddle/fluid/framework/scope.h"
+           "paddle/fluid/framework/ir/node.h"
+           "paddle/fluid/framework/ir/graph.h"
+           "paddle/fluid/framework/framework.proto"
+	   "python/paddle/distributed/__init"
+	   "python/paddle/distributed/fleet/__init__.py"
+           "python/requirements.txt"
+           "python/paddle/fluid/__init__.py"
+           "python/paddle/fluid/compiler.py"
+           "python/paddle/fluid/parallel_executor.py"
+           "python/paddle/fluid/framework.py"
+           "python/paddle/fluid/backward.py"
+           "paddle/fluid/operators/distributed/send_recv.proto.in"
+           "paddle/fluid/framework/unused_var_check.cc"
+           "paddle/fluid/pybind/op_function_generator.cc"
+           "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py"
+           "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py"
+           "tools/wlist.json"
+           )
+
+approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
+git_files=`git diff --numstat upstream/$BRANCH| wc -l`
+git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'`
+failed_num=0
+echo_list=()
+
+
+function check_approval(){
+    person_num=`echo $@|awk '{for (i=2;i<=NF;i++)print $i}'`
+    APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py $1 $person_num`
+    if [[ "${APPROVALS}" == "FALSE" && "${echo_line}" != "" ]]; then
+        add_failed "${failed_num}. ${echo_line}"
+    fi
+}
+
+
+function add_failed(){
+    failed_num=`expr $failed_num + 1`
+    echo_list="${echo_list[@]}$1"
+}
+
+
+if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
+    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n"
+    check_approval 1 38231817
+fi
+
+for API_FILE in ${API_FILES[*]}; do
+  API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeLists.txt" || true`
+  if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
+      # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
+      # You can use http://caius.github.io/github_id/ to find Github user id.
+      # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676. Dong Daxiang 35550832.
+      if [ "${API_FILE}" == "CMakeLists.txt" ];then
+          echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n"
+          check_approval 1 6836917 46782768
+      elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then
+          echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n"
+          check_approval 1 6836917 47554610
+      elif [ "${API_FILE}" == "python/requirements.txt" ];then
+          echo_line="You must have one RD (kolinwei (Recommend) or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n"
+          check_approval 1 22165420 6836917
+      elif [ "${API_FILE}" == "paddle/fluid/operators/distributed/send_recv.proto.in" ];then
+          echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n"
+          check_approval 1 10721757 5442383
+      elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then
+          echo_line="You must have one RD (zhiqiu (Recommend) or luotao1) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
+          check_approval 1 6888866 6836917
+      elif [ "${API_FILE}" == "paddle/fluid/pybind/op_function_generator.cc" ];then
+          echo_line="You must have one RD (zhiqiu (Recommend) , phlrain) approval for the changes of paddle/fluid/pybind/op_function_generator.cc, which manages the logic of automatic generating op functions for dygraph. \n"
+          check_approval 1 6888866 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py" ];then
+          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (hong19860320 (Recommend), luotao1, phlrain) approval for the changes of check_shape_white_list.py, which manages the white list of operators with limited input size. Inputs size of all cases in the op test must be greater than or equal to 100. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/OP-Test-Input-Shape-Requirements. \n"
+          check_approval 1 9973393 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py" ];then
+          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 (Recommend) or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py, which manages the white list of upgrading the precision of op test to float64. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
+          check_approval 1 52520497 26615455 6836917
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py" ];then
+           echo_line="You must have one RD (DannyIsFunny (Recommend), luotao1, phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/compile_vs_runtime_white_list.py, which manages the white list of compile&runtime lod-level check. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Compile_vs_Runtime-Check-Specification. \n"
+          check_approval 1 45189361 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py" ];then
+          echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n"
+          check_approval 1 12407750 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then
+          echo_line="You must have one RD (luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
+          check_approval 1 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" ];then
+          echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py, which manages the white list of error threshold for op test with float64 precision. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n"
+          check_approval 1 52520497 26615455 6836917
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" ];then
+          echo_line="You must have one RD (luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
+          check_approval 1 6836917 43953930
+      elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then
+        echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
+        check_approval 1 39303645 6836917 43953930
+      elif [ "${API_FILE}" == "tools/wlist.json" ];then
+        echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n"
+        check_approval 1 29231
+      elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then
+	echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
+	check_approval 1 35550832 38231817
+      elif [ "${API_FILE}" == "python/paddle/distributed/__init__.py" ]; then
+	echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
+	check_approval 1 35550832 38231817
+      else
+          echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
+          check_approval 1 3048612 46782768 12538138 6836917
+      fi
+  fi
+done
+
+FILTER=`git diff --name-only upstream/develop | grep -v "tools/"`
+HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH $FILTER |grep -o -m 1 "const_cast" || true`
+if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for the usage (either add or delete) of const_cast.\n"
+    check_approval 1 3048612 46782768 12538138 6836917
+fi
+
+HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "boost::get" || true`
+if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="boost::get is not recommended, because it may throw an bad_get exception without any stack information, so please use BOOST_GET(_**)(dtype, value) series macros here. If these macros cannot meet your needs, please use try-catch to handle boost::get and request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
+if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+HAS_DEFINE_FLAG=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "DEFINE_int32" |grep -o -m 1 "DEFINE_bool" | grep -o -m 1 "DEFINE_string" || true`
+if [ ${HAS_DEFINE_FLAG} ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must have one RD lanxianghit approval for the usage (either add or delete) of DEFINE_int32/DEFINE_bool/DEFINE_string flag.\n"
+    check_approval 1 47554610
+fi
+
+HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
+if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), liuwei1031, or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
+    check_approval 1 22165420 6836917 46661762
+  fi
+
+HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true`
+if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must have one RD (Superjomn (Recommend), luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n"
+    check_approval 1 328693 6836917
+  fi
+
+ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
+if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
+VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
+INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
+if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
+    check_approval 1 6836917 47554610 22561442
+fi
+
+ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"`
+ALL_OPTEST_BAN_DYGRAPH_MESSAGE=""
+for CHANGE_FILE in ${ALL_CHANGE_FILES}; do
+    ALL_OPTEST_BAN_DYGRAPH=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CHANGE_FILE} | grep "+" | grep "check_dygraph=" || true`
+    if [ "${ALL_OPTEST_BAN_DYGRAPH}" != "" ]; then
+        ALL_OPTEST_BAN_DYGRAPH_MESSAGE="${ALL_OPTEST_BAN_DYGRAPH_MESSAGE} ${CHANGE_FILE} : \n${ALL_OPTEST_BAN_DYGRAPH} \n"
+    fi
+done
+if [ "${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="Developers are not allowed to set the check_dygraph field directly, which is set to True by default. If you need to change the check_dygraph field, you must have one RD (phlrain (Recommend) or lanxianghit) review and approve. \nThe code that do not meet the specification are as follows:\n${ALL_OPTEST_BAN_DYGRAPH_MESSAGE}\n"
+    check_approval 1 43953930 47554610
+fi
+
+NEW_OP_ADDED=`git diff --name-only --diff-filter=A upstream/$BRANCH |grep -oE ".+_op..*" || true`
+if [ "${NEW_OP_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    GET_KERNEL_TYPE_FUNC_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -czoE "GetExpectedKernelType[(][^(){}]+[)][^{]+[{][^}]+[}]" || true`
+    INDICATE_VAR_DTYPE_CNT=`git diff -U0 --diff-filter=A upstream/$BRANCH |grep "+" |grep -co "IndicateVarDataType" || true`
+    if [ ${GET_KERNEL_TYPE_FUNC_CNT} -gt ${INDICATE_VAR_DTYPE_CNT} ]; then
+        echo_line="If you override GetExpectedKernelType method of OperatorWithKernel, please use OperatorWithKernel::IndicateVarDataType() method to get specific input variable's dtype, which checked whether the input variable is initialized (The details in https://github.com/PaddlePaddle/FluidDoc/pull/1527). If you don't use this method to check, you must have one RD (chenwhql (Recommend) , luotao1 or lanxianghit) approval for the usage of other methods.\n"
+        check_approval 1 6836917 47554610 22561442
+    fi
+fi
+
+HAS_OPERATORBASE_FLAG=`git diff -U0 --diff-filter=A upstream/$BRANCH | grep -E "public[[:space:]]+.*OperatorBase" || true`
+if [ "${HAS_OPERATORBASE_FLAG}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="In order to support dynamic graph, all ops are not recommended to inherit OperatorBase. Please use OperatorWithKernel instead.\nYou must have one RD (phlrain (Recommend), luotao1, lanxianghit or XiaoguangHu01) approval for the inherit of OperatorBase.\nYou inherit the OperatorBase class. The corresponding lines are as follows:\n${HAS_OPERATORBASE_FLAG}"
+    check_approval 1 43953930 6836917 47554610 46782768
+fi
+
+HAS_INPLACE_TESTS=`git diff -U0 upstream/$BRANCH |grep "+" |grep -E "inplace_atol[[:space:]]*=.*" || true`
+if [ "${HAS_INPLACE_TESTS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="The calculation results of setting inplace enabled and disabled must be equal, that is, it's not recommended to set inplace_atol.\n If you do need to use inplace_atol, you must have one RD (XiaoguangHu01, lanxianghit, phlrain, luotao1) approval for the usage of inplace_atol.\nThe corresponding lines are as follows:\n${HAS_INPLACE_TESTS}\n"
+    check_approval 1 46782768 47554610 43953930 6836917
+fi
+
+OP_FILE_CHANGED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE ".+_op..*" || true`
+if [ "${OP_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    for OP_FILE in ${OP_FILE_CHANGED};
+    do
+        CHECK_OBJECT_FLAGS=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${OP_FILE} |grep "+" |grep -E "ShareDataWith[(]|ShareBufferWith[(]" || true`
+        if [ "${CHECK_OBJECT_FLAGS}" != "" ]; then
+            ERROR_LINES="${ERROR_LINES}\n${OP_FILE}${CHECK_OBJECT_FLAGS}\n"
+        fi
+    done
+    if [ "${ERROR_LINES}" != "" ]; then
+        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
+        echo_line="Using ShareDataWith or ShareBufferWith is not recommended. You must have one RD's (zhhsplendid (Recommend), zhiqiu or luotao1 or lanxianghit) approval to use these methods. For more information, please refer to https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-is-prohibited-in-OP. The error lines are as follows:${ERROR_LINES}"
+        check_approval 1 6836917 6888866 47554610 7913861
+    fi
+fi
+
+NEW_OP_TEST_ADDED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE "test_.*.\.py" || true`
+if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    CHECK_OUTPUT=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep "self\.check_output(a*t*o*l*=*[0-9]"|grep "+" || true`
+    CHECK_OUTPUT_WITH_PLACE=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 "self\.check_output_with_place" |grep ", [atol*,0-9]"|grep "+" || true`
+    CHECK_GRAD=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A5 -E "self\.check_grad|self\.check_grad_with_place"|grep "max_relative_error=" |grep "+" || true`
+    CHECK_GRAD_CHECK=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep -A2 -E "checker\.double_grad_check"|grep "eps=|atol=|rtol=" |grep "+" || true`
+    CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK
+    if [ "${CHECK_WHOLE}" != "" ] ; then
+        CHECK_OP=${CHECK_WHOLE//+/'\n+'}       
+        echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n"
+        check_approval 1 6836917 47554610 12538138 43953930
+    fi
+fi
+
+UNITTEST_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "test_.*.\.py" || true`
+if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    for TEST_FILE in ${UNITTEST_FILE_CHANGED};
+    do
+        HAS_SKIP_CHECK_GRAD_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "@skip_check_grad_ci" || true`
+        if [ "${HAS_SKIP_CHECK_GRAD_CI}" != "" ]; then
+            ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_GRAD_CI}\n"
+        fi
+    done
+    if [ "${ERROR_LINES}" != "" ]; then
+        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
+        echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (zhangting2020 (Recommend), luotao1 or phlrain) approval for the usage (either add or delete) of @skip_check_grad_ci. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Gradient-Check-Is-Required-for-Op-Test. The corresponding lines are as follows:\n${ERROR_LINES}\n"
+        check_approval 1 26615455 6836917 43953930
+    fi
+fi
+
+RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true`
+if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED};
+    do
+        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|PROPERTIES[[:space:]]+TIMEOUT" || true`
+	if [[ ${RUNTYPE_ADD} != "" ]];then
+	    RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n"
+	fi
+    done
+    if [[ ${RUNTYPE_ADD_LINES} != "" ]];then
+        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE to EXCLUSIVE or DIST, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
+	check_approval 1 32428676 45041955
+    fi
+fi
+
+# Get the list of PR authors with unresolved unit test issues
+pip install PyGithub
+# For getting PR related data
+wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
+HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
+if [ "${HASUTFIXED}" != "" ]; then
+  echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n"
+  check_approval 1 45041955 22165420
+fi
+
+if [ -n "${echo_list}" ];then
+  echo "****************"
+  echo -e "${echo_list[@]}"
+  echo "There are ${failed_num} approved errors."
+  echo "****************"
+fi
+
+if [ -n "${echo_list}" ]; then
+  exit 6
+fi
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index 049621b9388997deaeab618c09c579858a60d47e..b10e76a4b4d037bfa0d72e74e660cf696f5ee1d3 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o
     go get github.com/Masterminds/glide && \
     rm -rf /root/requirements.txt
 
-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32
 
 RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu
index f424d676f70b127d84469bd70d9e7161a93f7bba..9fe58885fa553671cf5c08bd51295f271f4df668 100644
--- a/tools/dockerfile/Dockerfile.ubuntu
+++ b/tools/dockerfile/Dockerfile.ubuntu
@@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
 
 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 --no-cache-dir install opencv-python && \
+    pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.8 --no-cache-dir install opencv-python && \
+    pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip --no-cache-dir install opencv-python
+    pip --no-cache-dir install opencv-python==4.2.0.32
 
 #For docstring checker
 RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
@@ -219,4 +219,11 @@ RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0
 RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 CMD source ~/.bashrc
 
+# ccache 3.7.9
+RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
+    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
+    ./configure -prefix=/usr/local/ccache-3.7.9 && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
+
 EXPOSE 22
diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh
index 6f201a8579fea29ec6eaabf1faca77da26b11882..9f937cf9343784f10d186dd5bdcbace6f8a4e0e9 100755
--- a/tools/dockerfile/build_scripts/build_utils.sh
+++ b/tools/dockerfile/build_scripts/build_utils.sh
@@ -89,7 +89,7 @@ function do_cpython_build {
     fi
     # NOTE Make libpython shared library visible to python calls below
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
-    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
+    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2
     cd /
     ls ${MY_DIR}
     local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3ab1e68b37557404197cf552cbd0a4def08e9c41
--- /dev/null
+++ b/tools/gen_alias_mapping.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Brief:
+#     This code is used for generating the mapping list of Paddle API alias.
+#     Only the APIs set with the `DEFINE_ALIAS` flag is enable.
+# 
+# Arguments:
+#     None
+# 
+# Usage:
+#     Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh`     
+#
+# Returns:
+#     succ: 0
+# 
+#     Will also print the mapping list to stdout. The format of each line is as below:
+#         <real API implement>\t<API recommend>,<API other alias name1>,<API other alias name2>,...
+
+
+PADDLE_ROOT="$(dirname $(readlink -f ${BASH_SOURCE[0]}))/.."
+
+find ${PADDLE_ROOT}/python/ -name '*.py' \
+    | xargs  grep -v '^#' \
+    | grep 'DEFINE_ALIAS' \
+    | perl -ne '
+        if (/\/python\/(.*):from (\.*)(\w.*) import (.*?)\s+#DEFINE_ALIAS\s+$/) {
+            my @arr = split(", ", $4); 
+            foreach $i (@arr) {
+                printf "%s|%s|%s|%d\n", $3, $i, substr($1, 0, -3), length($2);
+            }
+        }' \
+    | awk -F '[|/]' '
+        {
+            key = "";
+            val = "";
+            if ($2 ~ /.* as .*/) {
+                split($2, arr, " as ");
+                old = arr[1];
+                new = arr[2];
+            } else {
+                old = $2;
+                new = $2;
+            }
+            for (i = 3; i <= (NF - 1 - $NF); ++i) {
+                val = val""$i".";
+            }
+            val =  val""$1"."old
+            for (i = 3; i <= (NF - 1); ++i) {
+                if ($i != "__init__") {
+                    key = key""$i".";
+                }
+            }
+            key = key""new;
+            n2o[key] = val;
+        } 
+        END {
+            for (new in n2o) {
+                old = n2o[new] in n2o ? n2o[n2o[new]] : n2o[new];
+                print old, length(new), new;
+            }
+        }' \
+    | sort -k 1,1 -k 2n,2 \
+    | awk '
+        {
+            o2n[$1] = o2n[$1] ? o2n[$1]","$3 : $3;
+        }
+        END { 
+            for (i in o2n) {
+                print i"\t"o2n[i];
+            }
+        }'
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
index ffef02dba4614f7bbbe13ebc30b40438a52b4590..e3a3374b943bc955d54afbef9755ed5147fad7d2 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 
@@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure]
 
 
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
 
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
index 837f0e486f6112bfc645c55ded8dfd0726d414d6..c27fdcea2401c26b1ef1dd377c42930b6e74fcf0 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 
-ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 
@@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
 RUN pip --no-cache-dir install certifi urllib3[secure] 
 
 
-# Install woboq_codebrowser to /woboq
-RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
-    (cd /woboq \
-     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-           -DCMAKE_BUILD_TYPE=Release . \
-     make)
 
 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 76e1c8baddcd5eeb24f1093d679934d2bbd90730..a18774a8b57b6424e0d89188c537a2086f5aa183 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -27,6 +27,7 @@ import pydoc
 import hashlib
 import six
 import functools
+import logging
 
 member_dict = collections.OrderedDict()
 
@@ -97,8 +98,11 @@ def queue_dict(member, cur_name):
     member_dict[cur_name] = "({}, ('document', '{}'))".format(args, doc_md5)
 
 
-def visit_member(parent_name, member):
-    cur_name = ".".join([parent_name, member.__name__])
+def visit_member(parent_name, member, member_name=None):
+    if member_name:
+        cur_name = ".".join([parent_name, member_name])
+    else:
+        cur_name = ".".join([parent_name, member.__name__])
     if inspect.isclass(member):
         queue_dict(member, cur_name)
         for name, value in inspect.getmembers(member):
@@ -163,7 +167,13 @@ def visit_all_module(mod):
         if inspect.ismodule(instance):
             visit_all_module(instance)
         else:
-            visit_member(mod.__name__, instance)
+            if member_name != instance.__name__:
+                logging.warn(
+                    "Found alias API, alias name is: {}, original name is: {}".
+                    format(member_name, instance.__name__))
+                visit_member(mod.__name__, instance, member_name)
+            else:
+                visit_member(mod.__name__, instance)
 
 
 modules = sys.argv[1].split(",")
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 102b50c43aeabc6ab2c67840edfaf42615cf51f5..033b4b8723aa30465cdb07198f470d7c09a0f326 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -480,14 +480,8 @@ def get_filenames():
                 filename = ''
                 print("\nWARNING:----Exception in get api filename----\n")
                 print("\n" + api + ' module is ' + module + "\n")
-            if filename != '':
-                # rm contrib file
-                if filename.startswith(
-                        '../python/paddle/fluid/contrib'
-                ) or filename == '../python/paddle/verison.py':
-                    pass
-                elif filename not in filenames:
-                    filenames.append(filename)
+            if filename != '' and filename not in filenames:
+                filenames.append(filename)
             # get all methods
             method = ''
             if inspect.isclass(eval(api)):
@@ -557,14 +551,18 @@ def get_wlist():
 
     '''
     wlist = []
+    wlist_file = []
     with open("wlist.json", 'r') as load_f:
         load_dict = json.load(load_f)
         for key in load_dict:
-            wlist = wlist + load_dict[key]
-    return wlist
+            if key == 'wlist_file':
+                wlist_file = wlist_file + load_dict[key]
+            else:
+                wlist = wlist + load_dict[key]
+    return wlist, wlist_file
 
 
-wlist = get_wlist()
+wlist, wlist_file = get_wlist()
 
 if len(sys.argv) < 2:
     print("Error: inadequate number of arguments")
@@ -590,8 +588,14 @@ else:
     if len(filenames) == 0 and len(whl_error) == 0:
         print("-----API_PR.spec is the same as API_DEV.spec-----")
         exit(0)
-    elif '../python/paddle/fluid/core_avx.py' in filenames:
-        filenames.remove('../python/paddle/fluid/core_avx.py')
+    rm_file = []
+    for f in filenames:
+        for w_file in wlist_file:
+            if f.startswith(w_file):
+                rm_file.append(f)
+                filenames.remove(f)
+    if len(rm_file) != 0:
+        print("REMOVE white files: %s" % rm_file)
     print("API_PR is diff from API_DEV: %s" % filenames)
     one_part_filenum = int(math.ceil(len(filenames) / cpus))
     if one_part_filenum == 0:
diff --git a/tools/wlist.json b/tools/wlist.json
index 6989882504eded7c56851e6e9351cef9b4975137..c6114918e5932a9cfd139fd0212698c5ea97d3cc 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -1,4 +1,10 @@
 {
+    "wlist_file" : [
+        "../python/paddle/fluid/contrib", 
+        "../python/paddle/verison.py",
+        "../python/paddle/fluid/core_avx.py",
+        "../python/paddle/distributed"
+    ],
     "wlist_inneed":[
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",
@@ -63,7 +69,6 @@
         "Compressor",
         "Compressor.config",
         "Compressor.run",
-        "run_check",
         "HDFSClient.upload",
         "HDFSClient.download",
         "HDFSClient.is_exist",
@@ -107,12 +112,27 @@
         "Metric.update",
         "Metric.accumulate",
         "Metric.name",
-        "Metric.add_metric_op",
+        "Metric.compute",
         "Accuracy.reset",
         "Accuracy.update",
         "Accuracy.accumulate",
         "Accuracy.name",
-        "Accuracy.add_metric_op",
+        "Accuracy.compute",
+        "Precision.reset",
+        "Precision.update",
+        "Precision.accumulate",
+        "Precision.name",
+        "Precision.compute",
+        "Recall.reset",
+        "Recall.update",
+        "Recall.accumulate",
+        "Recall.name",
+        "Recall.compute",
+        "Auc.reset",
+        "Auc.update",
+        "Auc.accumulate",
+        "Auc.name",
+        "Auc.compute",
         "Callback.set_params",
         "Callback.on_train_begin",
         "Callback.on_train_end",